diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
index 936ef7a7ff..8893f065ea 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_%(model_name)s.h"
+#include "processConfig.h"

 #include

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
index 22acd3abe9..b13b6e9edc 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
+++
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h new file mode 100644 index 0000000000..a4777347d0 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
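+
+// This header centralizes compile-time process metadata needed outside of CPPProcess.h:
+// currently only the number of Feynman diagrams, which is used (via MemoryBuffers.h and
+// the matrix element kernels) to size the per-diagram numerator buffers as nevt * ndiagrams.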
+ + +#ifndef MG5_CONFIG_%(processid_uppercase)s_H +#define MG5_CONFIG_%(processid_uppercase)s_H 1 + +namespace processConfig { + + constexpr int ndiagrams = %(ndiagrams)d; + +} + +#endif // MG5_CONFIG_%(processid_uppercase)s_H \ No newline at end of file diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 4c35c3eec6..d742565283 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -25,6 +25,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 0665bfb93b..dccdf2e736 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -16,6 +16,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -384,9 +447,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -462,8 +524,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -525,25 +586,35 @@
                    fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                    fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                   bool storeChannelWeights,          // if true, compute final multichannel weights
+                   bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                    const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+        totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: keep summing the denominators over helicities (the total is used below)
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag <
processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -588,16 +659,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -665,6 +764,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -674,6 +774,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
              fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+             unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+             bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
              fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
              fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -685,8 +787,10 @@
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
              int* allselcol, // output: helicity selection[nevt]
-             fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-             fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+             fptype* allNumerators, // output: multichannel numerators[nevt * ndiagrams], running_sum_over_helicities
+             fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+             unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+             bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
              const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -714,7 +818,6 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc
index 7de8886b1d..90ffe70624 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc
@@ -86,6 +86,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
              const fptype* allrndcol, // input: random numbers[nevt] for color selection
              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+             const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
              fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
              int* allselhel, // output: helicity selection[nevt]
@@ -94,6 +95,8 @@
              fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
              fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
              fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+             unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+             bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
              fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
              fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -110,6 +113,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
              const fptype* allrndcol, // input: random numbers[nevt] for color selection
              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+             const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
              fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
              int* allselhel, // output: helicity selection[nevt]
@@ -117,6 +121,8 @@ namespace mg5amcCpu
              int* allselcol, // output: helicity selection[nevt]
              fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
              fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+             unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+             bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
              const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc
index 4372edde52..aa7ad1165e 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc
@@ -17,7 +17,7 @@
   gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-  gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+  gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
   gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
   gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -31,11 +31,14 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -47,6 +50,7 @@
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+  // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -55,9 +59,10 @@
     const int ihel = cGoodHel[ighel];
     fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-
fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -68,9 +73,15 @@ // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -108,40 +119,6 @@ const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -156,7 +133,8 @@ cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -204,82 +182,97 @@ } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV %% neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - 
//printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV %% neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%%d icol=%%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -304,13 +297,7 @@ // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -318,13 +305,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cmath> // sqrt, M_PI
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool is_good_hel[CPPProcess::ncomb];
+    sigmaKin_getGoodHel(
+      momenta, couplings, matrix_elements, numerators, denominators,
+#ifdef MGONGPUCPP_GPUIMPL
+      color_jamps,
+#endif
+      is_good_hel,
+      count );
+    sigmaKin_setGoodHel( is_good_hel );
+    return nullptr;
+  }
+
+  void initialize(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    // static local initialization is called exactly once in a thread-safe way
+    static void* dummy = initialize_impl( momenta, couplings, matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+                                          color_jamps,
+#endif
+                                          numerators,
+                                          denominators,
+                                          count );
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__
+#endif
+    void
+    transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride )
+  {
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    std::size_t i_page = i_event / page_size;
+    std::size_t i_vector = i_event % page_size;
+
+    for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part )
+    {
+      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
+      {
+        momenta_out[i_page * CPPProcess::npar * 4 * page_size + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event];
+      }
+    }
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+
+  __global__ void copy_inputs(
+    const double* momenta_in,
+    const double* helicity_random_in,
+    const double* color_random_in,
+    const double* diagram_random_in,
+    const double* alpha_s_in,
+    fptype* momenta,
+    fptype* helicity_random,
+    fptype* color_random,
+    fptype* diagram_random,
+    fptype* g_s,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
+    helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
+    color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
+    g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to twice the SIMD page size (in mixed floating-point
+    // precision, sigmaKin processes two neppV event pages at a time, see #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
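+      // missing optional inputs fall back to defaults: 0.5 for the helicity/color/diagram
+      // selection random numbers, and g_s = sqrt( 4 * pi * 0.118 ) ~= 1.2177 (i.e. alpha_s = 0.118)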
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
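+   * For example, a caller built against interface version 1.0 remains binary
+   * compatible with an implementation reporting 1.3, but not with one reporting 2.0.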
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves global metadata about the matrix element implementation, e.g. the
+   * device it targets or its particle, diagram, helicity and color counts.
+   *
+   * @param meta_key
+   *    key of the metadata entry to retrieve
+   * @param result
+   *    pointer to the memory receiving the value; its type depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @param param_card_path
+   *    path to the parameter file
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
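+   *
+   * A minimal call sketch (hypothetical buffer names; `momenta`, `alpha_s` and `m2`
+   * are caller-allocated arrays, with `count` events stored contiguously, i.e.
+   * stride == count and offset == 0):
+   *
+   *   UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
+   *   void const* inputs[] = { momenta, alpha_s };
+   *   UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* outputs[] = { m2 };
+   *   UmamiStatus status = umami_matrix_element(
+   *     handle, count, count, 0, 2, in_keys, inputs, 1, out_keys, outputs );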
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py
index 3f8a85afa6..ef2ea6baf3 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py
@@ -1323,7 +1323,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -1332,7 +1332,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -1451,6 +1451,7 @@ def generate_process_files(self):
         self.edit_check_sa()
         self.edit_mgonGPU()
         self.edit_processidfile() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses)
+        self.edit_processConfig() # sub process specific, not to be symlinked from the Subprocesses directory
         self.edit_colorsum() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses)
         self.edit_testxxx() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific)
         self.edit_memorybuffers() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific)
@@ -1543,6 +1544,17 @@ def edit_colorsum(self):
         ff = open(pjoin(self.path, 'color_sum.cc'),'w')
         ff.write(template % replace_dict)
         ff.close()
+
+    def edit_processConfig(self):
+        """Generate processConfig.h"""
+        ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_processConfig')
+        template = open(pjoin(self.template_path,'gpu','processConfig.h'),'r').read()
+        replace_dict = {}
+        replace_dict['ndiagrams'] = len(self.matrix_elements[0].get('diagrams'))
+        replace_dict['processid_uppercase'] = self.get_process_name().upper()
+        ff = open(pjoin(self.path, 'processConfig.h'),'w')
+        ff.write(template % replace_dict)
+        ff.close()
 
     def generate_subprocess_directory_end(self, **opt):
         """ opt contain all local variable of the fortran original function"""
@@ -1926,7 +1938,8 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
         const fptype* COUPs[nxcoup];
         for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        fptype* numerators = allNumerators;
+        const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+        fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
         fptype* denominators = allDenominators;
 #endif
 #else
@@ -1939,7 +1952,7 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
         for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
           COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -1948,12 +1961,8 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
         for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-        // SCALAR channelId for the current event (CUDA)
-        unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
         // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif""")
         diagrams = matrix_element.get('diagrams')
@@ -1985,8 +1994,12 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi
                 ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diag_to_config[id_amp]) # BUG #472
                 ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472
                 res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL")
-                res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diagram.get('number'))
-                res.append("if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );")
+                diagnum = diagram.get('number')
+                res.append("if( storeChannelWeights )")
+                res.append("{")
+                res.append(" numerators_sv[%i] += cxabs2( amp_sv[0] );" % (diagnum-1)) +
res.append(" denominators_sv += cxabs2( amp_sv[0] );") + res.append("}") res.append("#endif") else: res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index e54290d5a7..6669d53123 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -123,6 +123,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'gpu/testmisc.cc', s+'gpu/testxxx_cc_ref.txt', s+'gpu/valgrind.h', s+'gpu/perf.py', s+'gpu/profile.sh', s+'gpu/cudacpp_overlay.mk', s+'gpu/makefile_wrapper.mk', + s+'gpu/umami.h', s+'gpu/umami.cc', s+'CMake/SubProcesses/CMakeLists.txt'], 'test': [s+'gpu/cudacpp_test.mk']} @@ -151,6 +152,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'testxxx.cc', # this is generated from a template in Subprocesses but we still link it in P1 'MemoryBuffers.h', # this is generated from a template in Subprocesses but we still link it in P1 'MemoryAccessCouplings.h', # this is generated from a template in Subprocesses but we still link it in P1 + 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] # AV - use template files from PLUGINDIR instead of MG5DIR and change their names @@ -264,7 +266,7 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): done""" try: result = subprocess.run( - ["bash", "-lc", patch_coupl_write], + ["bash", "-c", patch_coupl_write], cwd=pjoin(self.dir_path, "Source", "MODEL"), text=True, capture_output=True, diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index b7cdf09c17..cac2fc9257 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004445075988769531  +DEBUG: model prefixing takes 0.0017848014831542969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -149,21 +150,21 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.007 s Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -175,22 +176,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.060 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.006 s +Wrote files for 8 helas calls in 1.876 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.170 s +ALOHA: aloha creates 3 routines in 0.129 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.184 s +ALOHA: aloha creates 7 routines in 0.131 s FFV1 FFV1 FFV2 @@ -199,32 +200,34 @@ ALOHA: aloha creates 7 routines in 0.184 s FFV4 FFV2_4 FFV2_4 
-FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m2.396s -user 0m1.798s -sys 0m0.425s -Code generation completed in 2 seconds +real 0m9.417s +user 0m1.278s +sys 0m0.670s +Code generation completed in 10 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -245,12 +248,14 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run +/shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/madgraph/various/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ quit INFO: launch in debug mode @@ -274,9 +279,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
index 2450ec54f8..a536b3d076 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+ const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu-
 constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -239,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -248,7 +312,7 @@ namespace mg5amcCpu
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -344,7 +408,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -357,7 +422,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -366,12 +431,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -391,8 +452,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= amp_sv[0];
@@ -404,8 +468,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
      FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= amp_sv[0];
@@ -719,9 +786,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -797,8 +863,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
#endif /* clang-format on */ @@ -860,25 +925,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -923,16 +998,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1000,6 +1103,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1009,6 +1113,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1020,8 +1126,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1049,7 +1157,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1065,7 +1172,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1079,11 +1186,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1095,6 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1103,9 +1214,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1116,9 +1228,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1156,40 +1274,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1204,7 +1288,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1252,82 +1337,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1352,13 +1452,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1366,13 +1460,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
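/* each event record now spans processConfig::ndiagrams numerator slots (one per diagram), hence the record offset is scaled by ndiagrams */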
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index c9d280d0f6..3ca62dbc6e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: selected diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: selected diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h new file mode 100644 index 0000000000..65cfee8266 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/processConfig.h @@ -0,0 +1,16 
@@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H +#define MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool 
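/* one flag per helicity combination */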
is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
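/* g_s = sqrt( 4*pi*alpha_s ); the hardcoded fallback corresponds to alpha_s ~ 0.118 */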
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
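+ // Map the untyped UMAMI input buffers onto typed pointers: the element type of each buffer is fixed by its key (double for momenta, alpha_s and the random numbers, int for the index inputs); unknown keys are rejected with UMAMI_ERROR_UNSUPPORTED_INPUT.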
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
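/* note: all scratch buffers are padded to rounded_count (a whole number of thread blocks) and are allocated stream-ordered on gpu_stream; they are released below via gpuFreeAsync on the same stream */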
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (presumably because mixed-precision SIMD builds process two neppV pages at a time, see #924) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves global metadata about the matrix element implementation, such as the + * device it runs on or the number of particles, diagrams and helicity combinations + * of the process. + * + * @param meta_key + * key selecting the metadata entry to query + * @param result + * pointer to caller-allocated storage for the value; its type depends on the + * key (int for the count keys, UmamiDevice for UMAMI_META_DEVICE) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
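+ * + * A minimal caller-side sketch (illustrative only; n_events and the caller-allocated + * momenta and m2 arrays are hypothetical, with momenta laid out as described for the + * stride parameter below): + * + * UmamiHandle h; + * umami_initialize( &h, "param_card.dat" ); + * UmamiInputKey ikey = UMAMI_IN_MOMENTA; + * void const* in = momenta; // double array of 4 * n_particles * n_events values + * UmamiOutputKey okey = UMAMI_OUT_MATRIX_ELEMENT; + * void* out = m2; // double array of n_events values + * umami_matrix_element( h, n_events, n_events, 0, 1, &ikey, &in, 1, &okey, &out ); + * umami_free( h );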
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 3c991f09cf..6cd2239516 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -1,8 +1,8 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 1.185530662536621) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004302024841308594  +DEBUG: model prefixing takes 0.001810312271118164  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,32 +150,32 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.003 s +1 processes with 2 diagrams generated in 0.006 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +Generated helas calls for 1 subprocesses (2 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.190 s +ALOHA: aloha creates 4 routines in 0.122 s FFV1 FFV1 FFV2 @@ -184,17 +184,17 @@ ALOHA: aloha creates 4 routines in 0.190 s FFV4 FFV2_4 FFV2_4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m1.709s -user 0m1.562s -sys 0m0.115s -Code generation completed in 2 seconds +real 0m2.084s +user 0m0.455s +sys 0m0.171s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include <memory> @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index af61f3ea74..ec11e4c04b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds 
+#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, update the multichannel numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt*ndiagrams], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, update the multichannel numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt*ndiagrams], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -717,9 +778,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings,
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -795,8 +855,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -858,25 +917,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += ghelAllDenominators[ievt + ighel * nevt]; // also sum the per-helicity denominators into the "helicity #0" buffer + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -921,16 +990,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -998,6 +1095,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1007,6 +1105,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1018,8
+1118,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1047,7 +1149,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1063,7 +1164,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1077,11 +1178,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1093,6 +1197,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1101,9 +1206,10 @@ const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps,
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1114,9 +1220,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,40 +1266,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1202,7 +1280,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1250,82 +1329,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1350,13 +1444,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1364,13 +1452,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index c9d280d0f6..3ca62dbc6e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h new file mode 100644 index 0000000000..65cfee8266 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
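+// NB (editorial sketch, names as used in this patch): processConfig::ndiagrams sizes the per-event numerator buffers (numerators[nevt*ndiagrams], one entry per diagram per event) that drive the multichannel weights and the event-by-event diagram sampling; e.g. in the CUDA layout the numerator of diagram idiag for event ievt is read as allNumerators[ievt * processConfig::ndiagrams + idiag], while the C++ SIMD layout interleaves the neppV events of a page within each diagram.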
+ + +#ifndef MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H +#define MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_EPEM_MUPMUM_H \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cmath> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel
); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default: g_s corresponding to alpha_s = 0.118 + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
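+ // Each UmamiInputKey determines the pointee type of the matching type-erased entry in 'inputs' (double arrays for momenta, alpha_s and the random numbers; int arrays for flavor and diagram indices); keys this process cannot honour return UMAMI_ERROR_UNSUPPORTED_INPUT so callers can detect the capability gap.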
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to a double SIMD page (2 * neppM events): sigmaKin processes two neppV pages at a time in mixed-precision builds + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default: g_s corresponding to alpha_s = 0.118 + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves metadata about the compiled process, such as the device it runs on + * or its particle, diagram and helicity counts. + * + * @param meta_key + * key of the metadata entry to query + * @param result + * pointer to caller-allocated memory receiving the value; its type depends + * on the key (e.g. UmamiDevice for UMAMI_META_DEVICE, int for the counts) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
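+ *
+ * In this implementation the momenta are read in an SoA layout (see the
+ * transpose_momenta helper in umami.cc): component i_mom of particle i_part for
+ * event i_event is taken from
+ * momenta[offset + stride * ( n_particles * i_mom + i_part ) + i_event].
+ *
+ * A minimal, illustrative call sequence for one event of a four-particle process
+ * (error handling omitted; the param_card.dat path is an assumption of this example):
+ *
+ *   UmamiHandle h;
+ *   umami_initialize( &h, "param_card.dat" );
+ *   double mom[16] = { 0 }; // 4 components x 4 particles; count=1, stride=1, offset=0
+ *   double m2[1];
+ *   UmamiInputKey ikeys[1] = { UMAMI_IN_MOMENTA };
+ *   void const* ins[1] = { mom };
+ *   UmamiOutputKey okeys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
+ *   void* outs[1] = { m2 };
+ *   umami_matrix_element( h, 1, 1, 0, 1, ikeys, ins, 1, okeys, outs );
+ *   umami_free( h );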
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 156f7ce8e7..b485abf77b 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004584789276123047  +DEBUG: model prefixing takes 0.0018193721771240234  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.007 s +1 processes with 3 diagrams generated in 0.015 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -176,49 +177,51 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  Generated helas calls for 1 subprocesses (3 diagrams) in 0.009 s -Wrote files for 10 helas calls in 0.078 s +Wrote files for 10 helas calls in 1.922 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.103 s +ALOHA: aloha creates 2 routines in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.088 s +ALOHA: aloha creates 4 routines in 0.069 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m2.028s -user 0m1.664s -sys 0m0.358s -Code generation completed in 2 seconds +real 0m8.946s +user 0m1.272s +sys 0m0.626s +Code generation completed in 9 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -239,9 +242,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -268,9 +271,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
- sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+ sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
+ // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+ // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+ static __host__ __device__ inline fptype_sv*
+ kernelAccessP( fptype* buffer )
+ {
+ return reinterpret_cast<fptype_sv*>( buffer );
+ }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
+
 #include 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
 typedef BufferBase<fptype> BufferNumerators;
 // The size (number of elements) per event in a memory buffer for numerators
- constexpr size_t sizePerEventNumerators = 1;
+ // (should be equal to the number of diagrams in the process)
+ constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 #ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
index 55167ebaf3..4204e595d8 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+ ,
+ const int ievt00,
+ bool sanityCheckMixedPrecision = true
+#endif
+ )
+ {
+ unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+ using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current event (CUDA)
+ // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+ if( allChannelIds != nullptr )
+ {
+ const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+ const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+ // NB: channelIds_sv is a scalar in CUDA
+ channelId = channelIds_sv;
+ assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+ }
+#else // C++
+ using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+ // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+ // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+ if( allChannelIds != nullptr )
+ {
+ // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+ const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+ uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+ // NB: channelIds_sv is a scalar in no-SIMD C++
+ channelId = channelIds_sv;
+#else
+ // NB: channelIds_sv is a vector in SIMD C++
+ channelId = channelIds_sv[0]; // element[0]
+ for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+ {
+ assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+ }
+#endif
+ assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+ if( sanityCheckMixedPrecision )
+ {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+ const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+ uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+ // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
+ for( int i = 0; i < neppV; ++i )
+ {
+ assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+ }
+#endif
+ }
+ }
+#endif // MGONGPUCPP_GPUIMPL
+ return channelId;
+ }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
 constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
 constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g.
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +469,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -732,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -810,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
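+ // (in the non-multichannel build there are no numerator/denominator buffers to fill during helicity filtering)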
 #endif /* clang-format on */
@@ -873,25 +941,35 @@ namespace mg5amcCpu
 fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ bool storeChannelWeights, // if true, compute final multichannel weights
+ bool mulChannelWeight, // if true, multiply matrix element by channel weight
 #endif
 const fptype globaldenom ) /* clang-format on */
 {
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
 allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const int nevt = gridDim.x * blockDim.x;
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
 fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
 fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
 for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
 {
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- totAllNumerators[ievt] += hAllNumerators[ievt];
 totAllDenominators[ievt] += hAllDenominators[ievt];
+ fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+ fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+ for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+ {
+ firstNumerator[idiag] += hAllNumerators[idiag];
+ }
+ }
+ if( mulChannelWeight )
+ {
+ unsigned int channelId = allChannelIds[ievt];
+ allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
 }
- allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
 }
 #endif
 return;
@@ -936,16 +1014,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 __global__ void
- select_col( int* allselcol, // output: color selection[nevt]
- const fptype* allrndcol, // input: random numbers[nevt] for color selection
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
- const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
- const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ select_col_and_diag( int* allselcol, // output: color selection[nevt]
+ unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+ const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+ const fptype* allNumerators, // input: multichannel numerators[nevt*ndiagrams], summed over helicities
+ const fptype* allDenominators, // input: multichannel denominators[nevt], summed over helicities
+ const int nevt ) // input: #events (for cuda: nevt == ndim ==
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1013,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1022,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1033,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1062,7 +1173,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1078,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1092,11 +1202,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1108,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1116,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1129,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,40 +1290,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1217,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1265,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1365,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1379,13 +1476,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 26652fc739..5fdf36bb26 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. 
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef 
MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to a multiple of two SIMD pages (mixed-precision SIMD builds process two neppV pages at a time) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries a global metadata entry of this matrix element implementation, e.g. the + * device it targets or its particle, diagram and helicity counts. + * + * @param meta_key + * key selecting the metadata entry to query + * @param result + * pointer to caller-allocated memory that receives the value; its type depends + * on the key (UmamiDevice for UMAMI_META_DEVICE, int for the count keys) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 1f90d3c408..d5886a1099 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004430294036865234  +DEBUG: model prefixing takes 0.0018737316131591797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,45 +151,45 @@ INFO: Please specify coupling orders to bypass this step. 
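For reference, a minimal UMAMI client might look as follows. This is an illustrative sketch, not code from this patch: the param_card path and batch size are invented, only momenta and alpha_s are passed (momenta is the one required input; random numbers omitted here default to 0.5 inside umami.cc), and the momenta layout follows transpose_momenta(), i.e. momenta[(npar * i_mom + i_part) * stride + i_event] with npar = 4 for g g > t t~.

#include "umami.h"

#include <cstdio>

int main()
{
  UmamiHandle handle;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // hypothetical path

  int ndiagrams = 0;
  umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiagrams ); // 3 for this process (processConfig::ndiagrams)

  constexpr size_t count = 16;        // events per batch (illustrative)
  double momenta[4 * 4 * count] = {}; // fill with real phase-space points before calling
  double alpha_s[count];
  for( size_t i = 0; i < count; ++i ) alpha_s[i] = 0.118;
  double m2[count];

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
  const void* in_ptrs[] = { momenta, alpha_s };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* out_ptrs[] = { m2 };

  // stride == count and offset == 0: evaluate one contiguous batch
  if( umami_matrix_element( handle, count, count, 0, 2, in_keys, in_ptrs, 1, out_keys, out_ptrs ) == UMAMI_SUCCESS )
    std::printf( "ME[0] = %g\n", m2[0] );
  return umami_free( handle ) == UMAMI_SUCCESS ? 0 : 1;
}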
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.006 s +1 processes with 3 diagrams generated in 0.011 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
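The event-by-event diagram choice that this patch introduces (select_col_and_diag in the GPU code below, and the matching loop in the C++ sigmaKin) samples channel c with probability N_c / sum_k N_k from the per-diagram numerators. A standalone sketch of that selection logic, with hypothetical names and with the channel2iconfig masking of invalid channels omitted:

#include <cassert>
#include <vector>

// Illustration only: pick a 1-based channelId with probability N_c / sum_k N_k,
// given one event's per-diagram numerators and a uniform random number r in [0,1).
unsigned int sampleDiagram( const std::vector<double>& numerators, double r )
{
  double normalization = 0.;
  for( double n : numerators ) normalization += n;
  assert( normalization > 0. );
  double cumulative = 0.;
  for( unsigned int ichan = 0; ichan < numerators.size(); ++ichan )
  {
    cumulative += numerators[ichan];
    if( r < cumulative / normalization ) return ichan + 1;
  }
  return (unsigned int)numerators.size(); // guard against r ~ 1 with rounding
}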
-Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.121 s +ALOHA: aloha creates 2 routines in 0.065 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
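The gpuMallocAsync/gpuFreeAsync/gpuStreamSynchronize macros (added to GpuAbstraction.h below and used throughout umami.cc) wrap the stream-ordered allocators of CUDA and HIP behind one portable name. A minimal sketch of the usage pattern they enable, with a hypothetical kernel and assuming GpuRuntime.h pulls in the abstraction macros and checkGpu:

#include "GpuRuntime.h" // assumption: provides the gpu* macros and checkGpu

__global__ void scale( double* buf, double factor, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) buf[i] *= factor;
}

void scaleOnStream( gpuStream_t stream, double factor, int n )
{
  double* buf = nullptr;
  gpuMallocAsync( &buf, n * sizeof( double ), stream ); // stream-ordered: no device-wide sync
  scale<<<( n + 255 ) / 256, 256, 0, stream>>>( buf, factor, n );
  gpuFreeAsync( buf, stream );    // the free is queued on the same stream
  gpuStreamSynchronize( stream ); // block the host only on this stream
}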
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.508s -user 0m0.439s -sys 0m0.064s -Code generation completed in 1 seconds +real 0m1.735s +user 0m0.348s +sys 0m0.112s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 2b06bb84d0..bbc2c6c17c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00,
+ bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -729,9 +790,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, 
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -807,8 +867,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -870,25 +929,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -933,16 +1002,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random 
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1010,6 +1107,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1019,6 +1117,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1030,8 
+1130,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1059,7 +1161,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1075,7 +1176,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1089,11 +1190,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1105,6 +1209,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1113,9 +1218,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, 
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1126,9 +1232,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1166,40 +1278,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1214,7 +1292,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1262,82 +1341,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1362,13 +1456,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1376,13 +1464,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 26652fc739..5fdf36bb26 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, 
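The two selection loops in this region, diagram choice from the per-diagram numerators and color choice from the cumulative targetamp array, are both instances of the same inverse-CDF sampling trick: accumulate unnormalized weights and return the first bin whose normalized running sum exceeds one uniform random number. A scalar sketch with illustrative names (the production code works on SIMD pages and stores the result 1-based):

#include <cassert>
#include <cstddef>

// Pick an index in [0,n) with probability weights[i] / sum(weights).
int sample_index( const double* weights, std::size_t n, double r )
{
  double norm = 0.;
  for( std::size_t i = 0; i < n; ++i ) norm += weights[i];
  assert( norm > 0. ); // mirrors the FPE guard: no choice is made when all weights vanish
  double cumulative = 0.;
  for( std::size_t i = 0; i < n; ++i )
  {
    cumulative += weights[i];
    if( r < cumulative / norm ) return (int)i; // first bin whose CDF passes r
  }
  return (int)n - 1; // rounding guard for r very close to 1
}

In the diagram loop the weights are the per-diagram numerators, skipping channels whose channel2iconfig entry is -1; in the color loop they are the jamp2 sums restricted to the colors enabled in icolamp for the selected iconfig.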
// input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, 
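An aside on the initialize()/initialize_impl() pair whose definition begins just above: it relies on the C++11 guarantee that a function-local static is initialized exactly once, even under concurrent calls, so the helicity filtering runs on the first call and every later call is a no-op; the void* returned by initialize_impl exists only so it can initialize that static. A generic sketch of the idiom with hypothetical names:

#include <cstdio>

static int expensive_setup()
{
  std::printf( "runs exactly once\n" );
  return 0;
}

void ensure_initialized()
{
  static int dummy = expensive_setup(); // thread-safe one-time init ("magic statics", C++11)
  (void)dummy;                          // silence unused-variable warnings
}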
+ const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
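An aside on transpose_momenta above: it converts the caller's layout, momenta_in[stride * (npar * i_mom + i_part) + i_event], into the internal AOSOA layout, in which events are grouped into pages of neppM events and each momentum component of each particle is contiguous within a page. The index arithmetic, isolated into a helper with assumed names but the same formula as the kernel:

#include <cstddef>

std::size_t aosoa_index( std::size_t i_event, std::size_t i_part, std::size_t i_mom,
                         std::size_t npar, std::size_t neppM )
{
  const std::size_t i_page = i_event / neppM; // which page of neppM events
  const std::size_t i_vec = i_event % neppM;  // slot inside the page
  return i_page * npar * 4 * neppM + i_part * 4 * neppM + i_mom * neppM + i_vec;
}

For example, with neppM = 4 and npar = 4, event 5 / particle 2 / component 0 lands at 1*64 + 2*16 + 0 + 1 = 97.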
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + }
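All of the device scratch buffers in this function go through the new gpuMallocAsync/gpuFreeAsync wrappers, i.e. stream-ordered allocation (cudaMallocAsync and friends in CUDA 11.2+, or the HIP equivalents): a buffer becomes usable when the stream reaches the allocation and is recycled when the stream reaches the free, so consecutive batches need no device-wide synchronization. A minimal CUDA sketch of the pattern:

#include <cstddef>
#include <cuda_runtime.h>

void scratch_on_stream( cudaStream_t s, std::size_t n )
{
  float* buf = nullptr;
  cudaMallocAsync( (void**)&buf, n * sizeof( float ), s ); // allocation is ordered on s
  // ... enqueue kernels that use buf on stream s ...
  cudaFreeAsync( buf, s );    // free is ordered after those kernels on s
  cudaStreamSynchronize( s ); // only required before the host inspects results
}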
+ computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */
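The CPU readout in the loop above undoes the SIMD layout of the per-diagram numerators: within each page of page_size events, the page_size values of one diagram are contiguous, and the ndiagrams blocks of a page follow one another. A scalar helper with the same index formula (illustrative names; the GPU path instead stores them event-major, numerators[i_event * ndiagrams + i_diag], as in copy_outputs):

#include <cstddef>

double diagram_weight( const double* numerators, const double* denominators,
                       std::size_t i_event, std::size_t i_diag,
                       std::size_t ndiagrams, std::size_t page_size )
{
  const std::size_t i_page = i_event / page_size; // SIMD page of this event
  const std::size_t i_vec = i_event % page_size;  // slot inside the page
  return numerators[i_page * page_size * ndiagrams + i_diag * page_size + i_vec] / denominators[i_event];
}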
+ const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries a global property of the matrix element implementation. Does not + * require an instance handle. + * + * @param meta_key + * key identifying the requested metadata item + * @param result + * pointer to caller-owned memory receiving the value; its type depends on + * the key (UmamiDevice for UMAMI_META_DEVICE, int for the count keys) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 0af9646028..c6e2bc2275 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -1,8 +1,8 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 0.5061478614807129) Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT ************************************************************ * * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.01866316795349121  +DEBUG: model prefixing takes 0.00186920166015625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,7 +151,7 @@ INFO: Please specify coupling orders to bypass this step. 
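To make the calling convention of the umami.h interface above concrete, here is a minimal hypothetical caller: one event, only the matrix element requested, a made-up parameter card path, and error handling elided. The momenta array follows the component-major layout expected by umami_matrix_element (for this gg to ttbar process npar = 4, so 16 doubles per event):

#include "umami.h"

#include <stdio.h>

int main( void )
{
  UmamiHandle h = NULL;
  if( umami_initialize( &h, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; /* hypothetical card path */

  double momenta[16] = { 0. }; /* E for all 4 partons, then px, py, pz: fill with a physical phase-space point */
  double me2[1] = { 0. };
  UmamiInputKey ikeys[1] = { UMAMI_IN_MOMENTA };
  const void* ins[1] = { momenta };
  UmamiOutputKey okeys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outs[1] = { me2 };

  umami_matrix_element( h, 1, 1, 0, 1, ikeys, ins, 1, okeys, outs ); /* count=1, stride=1, offset=0 */
  printf( "|M|^2 = %g\n", me2[0] );
  return umami_free( h ) == UMAMI_SUCCESS ? 0 : 1;
}

Additional inputs (alpha_s, the random numbers for helicity/color/diagram choice) and outputs (selected indices, per-diagram weights) are requested the same way, by growing the key and pointer arrays in lockstep.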
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.010 s +1 processes with 3 diagrams generated in 0.012 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -159,21 +159,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.025 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -187,9 +187,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py 
at line 1589]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -198,25 +198,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.088 s -Wrote files for 46 helas calls in 0.403 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.023 s +Wrote files for 46 helas calls in 4.525 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.419 s +ALOHA: aloha creates 5 routines in 0.191 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.553 s +ALOHA: aloha creates 10 routines in 0.166 s VVV1 VVV1 FFV1 @@ -226,32 +226,34 @@ ALOHA: aloha creates 10 routines in 0.553 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.986s -user 0m4.846s -sys 0m0.948s -Code generation completed in 6 seconds +real 0m12.237s +user 0m1.681s +sys 0m0.791s +Code generation completed in 12 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -272,9 +274,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -301,9 +303,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 55167ebaf3..4204e595d8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const 
int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +469,11 @@ 
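Note: the new per-diagram numerator plumbing rests on two pieces visible above, a pointer-returning accessor and a per-event stride of processConfig::ndiagrams. A minimal sketch follows, assuming the reinterpret_cast targets fptype_sv* exactly as in the sibling kernelAccess helpers and that the same accessor pattern is shared by the numerator access classes; the snippet is illustrative, not an extra change in the patch.

    // Sketch: kernelAccessP returns a pointer, so numerators_sv[idiag]
    // addresses the (scalar or SIMD) numerator slot of diagram idiag.
    static __host__ __device__ inline fptype_sv*
    kernelAccessP( fptype* buffer )
    {
      return reinterpret_cast<fptype_sv*>( buffer );
    }

    // GPU side: each thread (= one event) owns ndiagrams consecutive slots.
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
    fptype_sv* numerators_sv = kernelAccessP( numerators ); // numerators_sv[0..ndiagrams-1]
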
namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -732,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -810,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
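Note: every generated diagram block now follows one pattern: the old data-dependent branches on channelId become a single unconditional per-diagram store, which is what makes the a-posteriori channel choice possible. Illustrative pseudo-snippet, with idiag the zero-based diagram index:

    // Before: only the preselected channel accumulated its numerator
    //   if( channelId == idiag + 1 ) numerators_sv += cxabs2( amp_sv[0] );
    //   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
    // After: keep all per-diagram numerators; the denominator stays one running sum
    if( storeChannelWeights )
    {
      numerators_sv[idiag] += cxabs2( amp_sv[0] );
      denominators_sv += cxabs2( amp_sv[0] );
    }
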
#endif /* clang-format on */ @@ -873,25 +941,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -936,16 +1014,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1013,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1022,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1033,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1062,7 +1173,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1078,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1092,11 +1202,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1108,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1116,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1129,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,40 +1290,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
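Note: the reworked normalise_output reuses the "helicity #0" slice of the numerator super-buffer as the accumulator for the sum over good helicities, with an inner loop over diagrams; the single-diagram reweighting of the ME is then a lookup into that slice. A condensed sketch of the indexing, assuming the helicity-major [ighel][ievt][idiag] layout used at launch time:

    // offset( ighel, ievt, idiag ) = ( ievt + ighel * nevt ) * ndiagrams + idiag
    fptype* tot = ghelAllNumerators + ievt * processConfig::ndiagrams; // ighel == 0 slice
    for( int ighel = 1; ighel < dcNGoodHel; ighel++ )
      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
        tot[idiag] += ghelAllNumerators[( ievt + ighel * nevt ) * processConfig::ndiagrams + idiag];
    if( mulChannelWeight ) // multiply the ME by the single-diagram enhancement weight
      allMEs[ievt] *= tot[allChannelIds[ievt] - 1] / ghelAllDenominators[ievt];
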
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1217,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1265,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1365,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1379,13 +1476,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 26652fc739..5fdf36bb26 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 
CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index f6e0894592..e32a6ef9b4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
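Note: processConfig.h is a small generated per-process header exposing compile-time constants, here the diagram count of the P1 subprocess (3 for gg -> ttx), so that host and device code can size the per-diagram buffers without including CPPProcess internals. A minimal consumer sketch; the static_assert is illustrative and not part of the patch:

    #include "processConfig.h"

    // numerators are now ndiagrams entries per event instead of 1
    constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
    static_assert( processConfig::ndiagrams > 0, "expected a positive diagram count" );
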
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; @@ -410,8 +474,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -424,8 +491,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -439,8 +509,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -452,8 +525,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -466,8 +542,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -480,8 +559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -493,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -507,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -520,8 +608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -534,8 +625,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -548,8 +642,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -564,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -577,8 +677,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -590,8 +693,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -949,9 +1055,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, 
gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1027,8 +1132,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1090,25 +1194,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1153,16 +1267,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color 
selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1230,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1239,6 +1382,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] 
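Note: stripped of the buffer indexing, the diagram choice in select_col_and_diag is a plain inverse-CDF draw over the per-diagram numerators, skipping diagrams without an SDE configuration (channel2iconfig == -1). An illustrative scalar version, where num[i] stands for the numerator of diagram i for this event and rnd for allrnddiagram[ievt] in [0,1):

    fptype norm = 0;
    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
      if( mgOnGpu::channel2iconfig[ichan] != -1 ) norm += num[ichan];
    unsigned int channelId = mgOnGpu::nchannels; // fallback if rounding exhausts the loop
    fptype cdf = 0;
    for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
    {
      if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
      cdf += num[ichan];
      if( rnd < cdf / norm ) { channelId = ichan + 1; break; }
    }
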
super-buffer for nGoodHel <= ncomb individual helicities @@ -1250,8 +1395,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt * ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1279,7 +1426,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1295,7 +1441,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1309,11 +1455,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1325,6 +1474,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1333,9 +1483,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, 
ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1346,9 +1497,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1386,40 +1543,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1434,7 +1557,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1482,82 +1606,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1582,13 +1721,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1596,13 +1729,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index a0fbcbb773..a49500a023 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: 
random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
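+// +// This header collects per-process compile-time constants: processConfig::ndiagrams +// sizes the per-diagram numerator buffers (see sizePerEventNumerators in MemoryBuffers.h).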
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* 
momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
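+ // map each supplied input key to its typed pointer; keys not handled by this implementation are rejected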
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (mixed-precision SIMD builds process two neppV pages at a time, see #924) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
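+ * New keys are only appended, so existing enum values keep their meaning within + * a major version.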
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves a metadata value of the compiled matrix element (device type, number + * of particles, diagrams or helicity combinations). + * + * @param meta_key + * key identifying the requested metadata entry + * @param result + * pointer to storage for the result; its type depends on the key + * (UmamiDevice for UMAMI_META_DEVICE, int for the count keys) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
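+ * + * A minimal illustrative call (hypothetical values: 32 events of a 5-particle + * process, batch stride equal to the event count, zero offset; `handle` comes + * from umami_initialize; momenta are laid out as [component][particle][event] + * with the given stride, which is the layout this implementation transposes from): + * + * double momenta[4 * 5 * 32]; // filled by the caller + * double me2[32]; + * UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA }; + * const void* ins[] = { momenta }; + * UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT }; + * void* outs[] = { me2 }; + * UmamiStatus st = umami_matrix_element( handle, 32, 32, 0, 1, in_keys, ins, 1, out_keys, outs );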
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index e50d05daa6..d7b2672731 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004053354263305664  +DEBUG: model prefixing takes 0.0017828941345214844  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.016 s +1 processes with 16 diagrams generated in 0.024 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -176,25 +177,25 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (16 diagrams) in 0.030 s -Wrote files for 36 helas calls in 0.096 s +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.016 s +Wrote files for 36 helas calls in 2.297 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of 
routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.242 s +ALOHA: aloha creates 5 routines in 0.185 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.216 s +ALOHA: aloha creates 10 routines in 0.172 s VVV1 VVV1 FFV1 @@ -204,32 +205,34 @@ ALOHA: aloha creates 10 routines in 0.216 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. 
quit -real 0m2.399s -user 0m2.037s -sys 0m0.357s -Code generation completed in 3 seconds +real 0m9.939s +user 0m1.538s +sys 0m0.629s +Code generation completed in 10 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -250,9 +253,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -279,9 +282,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 7f29af7755..b8f69df605 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool 
sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0];
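// NB: illustrative sketch only, not part of the generated diff; the helper name is hypothetical.
// The hunks above widen the numerator buffer from one entry per event to one entry per diagram per
// event (sizePerEventNumerators becomes processConfig::ndiagrams), and each CUDA thread now works
// on its own slice &allNumerators[ievt * processConfig::ndiagrams]. The assumed CUDA-side indexing
// is simply a flattened [ievt][idiag] layout (the C++ SIMD path goes through NUM_ACCESS pages):

inline __host__ __device__ fptype& numeratorRef( fptype* allNumerators, int ievt, int idiag )
{
  // ndiagrams contiguous entries per event, i.e. allNumerators[ievt][idiag] flattened
  return allNumerators[ievt * processConfig::ndiagrams + idiag];
}

// With this layout, the per-diagram hunks below replace the old conditional running sum
// "if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] )" by an unconditional store into
// slot N-1, so the weights of all channels are available after a single pass over the diagrams.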
@@ -410,8 +474,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -424,8 +491,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -439,8 +509,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -452,8 +525,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -466,8 +542,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -480,8 +559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -493,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
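// NB: illustrative scalar sketch only, not part of the generated diff; the helper name is
// hypothetical. Once every diagram d has accumulated |A_d|^2 into its own slot, the
// single-diagram-enhancement weight of channel c is formed at the end (see normalise_output below)
// as w_c = |A_c|^2 / sum_d |A_d|^2 = numerators[c-1] / denominator, instead of being accumulated
// only for the one channelId known upfront:

inline fptype channelWeight( const fptype* numerators, fptype denominator, unsigned int channelId )
{
  // channelId is 1-based (Fortran convention), numerator slots are 0-based
  return numerators[channelId - 1] / denominator;
}

// allMEs[ievt] is then multiplied by this weight only when mulChannelWeight is true.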
@@ -507,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -520,8 +608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -534,8 +625,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -548,8 +642,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -564,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -577,8 +677,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -590,8 +693,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -949,9 +1055,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; 
// disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1027,8 +1132,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1090,25 +1194,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; + } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1153,16 +1267,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1230,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1239,6 +1382,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1250,8 +1395,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1279,7 +1426,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1295,7 +1441,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1309,11 +1455,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1325,6 +1474,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1333,9 +1483,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = 
ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1346,9 +1497,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1386,40 +1543,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1434,7 +1557,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1482,82 +1606,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1582,13 +1721,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1596,13 +1729,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ )
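// NB: illustrative sketch only, not part of the generated diff; the function name is hypothetical.
// Both select_col_and_diag (CUDA) and the C++ loop above sample a diagram ("channel") from the
// per-diagram numerators by inverting the discrete CDF with one uniform random number. Stripped of
// the SIMD paging and of the channel2iconfig bookkeeping, the algorithm is:

unsigned int sampleDiagram( const fptype* numerators, int ndiagrams, fptype rnd )
{
  fptype normalization = 0;
  for( int d = 0; d < ndiagrams; ++d ) normalization += numerators[d]; // sum of |A_d|^2
  fptype cumulative = 0;
  for( int d = 0; d < ndiagrams; ++d )
  {
    cumulative += numerators[d];
    if( rnd < cumulative / normalization ) return d + 1; // 1-based channel id
  }
  return ndiagrams; // guard for rnd ~ 1.0 (the patch presets channelId = mgOnGpu::nchannels for the same purpose)
}

// The real code additionally skips diagrams with mgOnGpu::channel2iconfig[d] == -1 (no associated
// SDE configuration) in both the normalization and the cumulative sum.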
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 1b49cac30b..1b956214b7 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* 
couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + }
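// NB: illustrative usage sketch only, not part of the generated diff. It assumes the count metas
// are written through the void* result slot as int, matching the casts reconstructed above (the
// stripped cast targets are an editorial reconstruction):

void queryProcessMeta()
{
  UmamiDevice device;
  int ndiagrams = 0;
  if( umami_get_meta( UMAMI_META_DEVICE, &device ) != UMAMI_SUCCESS ) return; // CPU, CUDA or HIP build
  if( umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiagrams ) != UMAMI_SUCCESS ) return;
  // UMAMI_META_COLOR_COUNT is not implemented yet and returns UMAMI_ERROR_UNSUPPORTED_META
}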
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to twice the SIMD page size for some reason
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ *  Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries a global metadata entry of the matrix element code.
+   *
+   * @param meta_key
+   *     key of the metadata entry to query
+   * @param result
+   *     pointer to caller-allocated memory where the result is written. Its type
+   *     depends on the key (UmamiDevice for UMAMI_META_DEVICE, int for the counts).
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *     pointer to an instance of the subprocess. Has to be cleaned up by
+   *     the caller with `umami_free`.
+   * @param param_card_path
+   *     path to the parameter file
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     real part of the parameter value
+   * @param parameter_imag
+   *     imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *     pointer to double to return imaginary part of the parameter value. Ignored
+   *     for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param count
+   *     number of events to evaluate the matrix element for
+   * @param stride
+   *     stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *     offset of the event index
+   * @param input_count
+   *     number of inputs to the matrix element
+   * @param input_keys
+   *     pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *     pointer to an array of void pointers to the inputs. The type of the inputs
+   *     depends on the input key
+   * @param output_count
+   *     number of outputs to the matrix element
+   * @param output_keys
+   *     pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *     pointer to an array of void pointers to the outputs. The type of the outputs
+   *     depends on the output key. The caller is responsible for allocating memory for
+   *     the outputs.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
index ab60b4e5bd..c635672d98 100644
--- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -57,7 +58,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.0042188167572021484
+DEBUG: model prefixing takes 0.0044193267822265625
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step.
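For reference, a minimal sketch of a caller driving the UMAMI interface declared in umami.h above. This is illustrative only and not part of the patch: the param_card path, the event counts, the zero-filled momenta buffer, and the int-typed result for the count metadata keys are assumptions; a real caller must fill the momenta with physical phase-space points, laid out with the batch stride as in the other UMAMI arrays.

// umami_example.cc - illustrative UMAMI caller (not part of this patch)
#include "umami.h"

#include <cstdio>
#include <vector>

int main()
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1;

  int npar = 0;
  umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ); // assumed int result; 5 for g g > t t~ g

  const size_t count = 8, stride = 8, offset = 0;
  std::vector<double> momenta( stride * npar * 4, 0. ); // placeholder: fill with real phase-space points
  std::vector<double> m2( stride, 0. );

  const UmamiInputKey input_keys[1] = { UMAMI_IN_MOMENTA };
  const void* inputs[1] = { momenta.data() };
  const UmamiOutputKey output_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outputs[1] = { m2.data() };
  if( umami_matrix_element( handle, count, stride, offset, 1, input_keys, inputs, 1, output_keys, outputs ) == UMAMI_SUCCESS )
    std::printf( "ME[0] = %g\n", m2[0] );

  umami_free( handle );
  return 0;
}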
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.017 s +1 processes with 16 diagrams generated in 0.025 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
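An aside on the SIMD-paged buffers read by the C++ output loop in umami.cc above: per-diagram numerators are stored page-major, with the indexing numerators[i_page * page_size * ndiagrams + i_diag * page_size + i_vector]. The following standalone sketch spells out that index arithmetic; the neppV and ndiagrams values here are illustrative, not taken from the generated code.

// numerator_layout_sketch.cc - illustrative index arithmetic for the SIMD-paged layout
#include <cassert>
#include <cstddef>

constexpr std::size_t neppV = 4;      // events per SIMD page (assumed value)
constexpr std::size_t ndiagrams = 16; // number of diagrams (16 for g g > t t~ g)

// Same arithmetic as the copy-out loop's numerators[...] indexing above
std::size_t numeratorIndex( std::size_t ievt, std::size_t idiag )
{
  const std::size_t ipage = ievt / neppV;
  const std::size_t ivec = ievt % neppV;
  return ipage * neppV * ndiagrams + idiag * neppV + ivec;
}

int main()
{
  // Within one page, the same diagram for consecutive events is contiguous...
  assert( numeratorIndex( 1, 3 ) == numeratorIndex( 0, 3 ) + 1 );
  // ...while consecutive diagrams of one event are neppV elements apart.
  assert( numeratorIndex( 0, 4 ) == numeratorIndex( 0, 3 ) + neppV );
  return 0;
}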
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.029 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. +Generated helas calls for 1 subprocesses (16 diagrams) in 0.015 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.230 s +ALOHA: aloha creates 5 routines in 0.161 s VVV1 VVV1 FFV1 @@ -186,17 +187,17 @@ ALOHA: aloha creates 5 routines in 0.230 s VVVV1 VVVV3 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
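The per-event diagram choice added by this patch (select_col_and_diag on the GPU side, and the equivalent C++ loop in sigmaKin, both further below) is an inverse-CDF draw over the per-diagram numerators, skipping channels whose channel2iconfig entry is -1. A standalone sketch of that selection logic, with illustrative channel counts and weights:

// diagram_sampling_sketch.cc - illustrative inverse-CDF channel choice (not the generated code)
#include <cstdio>

constexpr unsigned int nchannels = 3;
constexpr int channel2iconfig[nchannels] = { 1, -1, 2 }; // -1: channel has no SDE config and is skipped

unsigned int sampleDiagram( const double* numerators, double rnd )
{
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  double numerator_sum = 0.;
  unsigned int channelId = nchannels; // fallback, as in the kernel
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization )
    {
      channelId = ichan + 1; // NB channelIds are 1-based
      break;
    }
  }
  return channelId;
}

int main()
{
  const double numerators[nchannels] = { 3., 0., 1. }; // relative weights 3 : (skipped) : 1
  std::printf( "rnd=0.50 -> channelId %u\n", sampleDiagram( numerators, 0.50 ) ); // prints 1
  std::printf( "rnd=0.90 -> channelId %u\n", sampleDiagram( numerators, 0.90 ) ); // prints 3
  return 0;
}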
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.642s -user 0m0.586s -sys 0m0.050s -Code generation completed in 1 seconds +real 0m2.175s +user 0m0.523s +sys 0m0.144s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
 typedef BufferBase BufferNumerators;
 
 // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc
index 3897ffd9b4..45ea024451 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+
const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -943,9 +1004,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, 
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1021,8 +1081,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1084,25 +1143,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1147,16 +1216,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random 
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1224,6 +1321,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1233,6 +1331,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1244,8 
+1344,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1273,7 +1375,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1289,7 +1390,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1303,11 +1404,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1319,6 +1423,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1327,9 +1432,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, 
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1340,9 +1446,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1380,40 +1492,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1428,7 +1506,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1476,82 +1555,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1576,13 +1670,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1590,13 +1678,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 1b49cac30b..1b956214b7 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* 
allrndcol,           // input: random numbers[nevt] for color selection
   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+  const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
   fptype* allMEs,  // output: allMEs[nevt], |M|^2 final_avg_over_helicities
   int* allselhel,  // output: helicity selection[nevt]
@@ -171,6 +172,8 @@
   fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
   fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
   fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+  unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+  bool mulChannelWeight,          // if true, multiply the channel weight into the ME output
 #endif
   fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
   fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const fptype* allrndcol,           // input: random numbers[nevt] for color selection
   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+  const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
   fptype* allMEs,  // output: allMEs[nevt], |M|^2 final_avg_over_helicities
   int* allselhel,  // output: helicity selection[nevt]
@@ -194,6 +198,8 @@
   int* allselcol,          // output: color selection[nevt]
   fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
   fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+  unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+  bool mulChannelWeight,          // if true, multiply the channel weight into the ME output
 #endif
   const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h
new file mode 100644
index 0000000000..47044dbe6a
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
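+//
+// Illustrative sketch (not generated code; buffer and index names are hypothetical):
+// ndiagrams sizes the per-diagram numerator buffers used for event-by-event channel
+// selection, with the indexing conventions assumed elsewhere in this patch:
+//   fptype* numerators = new fptype[nevt * processConfig::ndiagrams];
+//   numerators[i_event * processConfig::ndiagrams + i_diag];                            // GPU layout
+//   numerators[i_page * neppV * processConfig::ndiagrams + i_diag * neppV + i_vector];  // C++ SIMD paged layout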
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H
+#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 16;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cstddef>
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool is_good_hel[CPPProcess::ncomb];
+    sigmaKin_getGoodHel(
+      momenta, couplings, matrix_elements, numerators, denominators,
+#ifdef MGONGPUCPP_GPUIMPL
+      color_jamps,
+#endif
+      is_good_hel,
+      count );
+    sigmaKin_setGoodHel( is_good_hel );
+    return nullptr;
+  }
+
+  void initialize(
const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
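+        // Dispatch each caller-provided input pointer by its key; keys that this
+        // backend does not support are rejected rather than silently ignored.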
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to twice the SIMD page size (in mixed precision the
+    // C++ sigmaKin processes two neppV event pages per call, cf. #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves global metadata about the matrix element implementation.
+   *
+   * @param meta_key
+   *     key selecting which metadata entry to query
+   * @param result
+   *     pointer to caller-allocated storage receiving the value; its type depends on
+   *     the key (UmamiDevice for UMAMI_META_DEVICE, int for the *_COUNT keys)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *     pointer to an instance of the subprocess. Has to be cleaned up by
+   *     the caller with `umami_free`.
+   * @param param_card_path
+   *     path to the parameter file
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     real part of the parameter value
+   * @param parameter_imag
+   *     imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *     pointer to double to return imaginary part of the parameter value. Ignored
+   *     for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
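+   *
+   * Usage sketch (illustrative only: a 5-particle process evaluated for 3 events,
+   * requesting just |M|^2, with error handling omitted):
+   * @code
+   *   UmamiHandle handle;
+   *   umami_initialize( &handle, "param_card.dat" );
+   *   double momenta[4 * 5 * 3]; // layout [i_mom][i_part][i_event], stride == count == 3
+   *   // ... fill momenta ...
+   *   double m2[3];
+   *   UmamiInputKey in_keys[1] = { UMAMI_IN_MOMENTA };
+   *   void const* inputs[1] = { momenta };
+   *   UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* outputs[1] = { m2 };
+   *   umami_matrix_element( handle, 3, 3, 0, 1, in_keys, inputs, 1, out_keys, outputs );
+   *   umami_free( handle );
+   * @endcode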
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 8c941153c6..b697a4a0e0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004433155059814453  +DEBUG: model prefixing takes 0.0018012523651123047  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.125 s +1 processes with 123 diagrams generated in 0.085 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -176,25 +177,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 
33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.307 s -Wrote files for 222 helas calls in 0.475 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.154 s +Wrote files for 222 helas calls in 3.038 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.280 s +ALOHA: aloha creates 5 routines in 0.194 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.246 s +ALOHA: aloha creates 10 routines in 0.167 s VVV1 VVV1 FFV1 @@ -207,32 +208,34 @@ ALOHA: aloha creates 10 routines in 0.246 s VVVV3 VVVV4 VVVV4 -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m3.426s -user 0m3.041s -sys 0m0.376s -Code generation completed in 4 seconds +real 0m10.227s +user 0m2.050s +sys 0m0.643s +Code generation completed in 10 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -253,9 +256,9 @@ Code generation completed in 4 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -282,9 +285,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
index 6664e7c6fc..ca0360d110 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int
ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -239,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -248,7 +312,7 @@ namespace mg5amcCpu
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights,
     fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -344,7 +408,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -357,7 +422,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -366,12 +431,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -430,8 +491,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
       VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -450,8 +514,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 3
      VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -470,8 +537,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 4
       VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -491,8 +561,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -505,8 +578,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -521,8 +597,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -535,8 +614,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 8
       FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[7] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -549,8 +631,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
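The same mechanical change repeats for every diagram hunk below: the old code accumulated one scalar numerator, gated on the preselected `channelId`, while the new code stores every diagram's |amp|^2 so that the weight of any channel can be formed after the fact. A standalone sketch (hypothetical sizes and names, not the generated code):

```cpp
// Sketch only: per-diagram numerators replace the single channel-gated scalar.
#include <cstdio>

constexpr int ndiagrams = 4; // stand-in for processConfig::ndiagrams

int main()
{
  double num[ndiagrams] = {}; // per-diagram numerators for one event
  double den = 0.;            // common denominator (sum over all diagrams)
  const double amp2[ndiagrams] = { 0.1, 0.4, 0.3, 0.2 }; // mock |amp|^2 per diagram
  for( int idiag = 0; idiag < ndiagrams; ++idiag ) // what each diagram hunk does
  {
    num[idiag] += amp2[idiag]; // was: "if( channelId == idiag+1 ) numerator += ..."
    den += amp2[idiag];        // was: "if( channelId != 0 ) denominator += ..."
  }
  const unsigned int channelId = 2; // any channel can now be weighted after the fact
  std::printf( "w(channel %u) = %f\n", channelId, num[channelId - 1] / den );
  return 0;
}
```

Note the off-by-one convention carried through the whole diff: `channelId` is 1-based, so diagram N updates `numerators_sv[N-1]`.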
@@ -565,8 +650,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -579,8 +667,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -593,8 +684,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -609,8 +703,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 13
       FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[12] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -623,8 +720,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 14
       FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[13] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -639,8 +739,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 15
       FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[14] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
@@ -655,8 +758,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 16
       FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[15] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += amp_sv[0];
       jamp_sv[17] -= amp_sv[0];
@@ -673,8 +779,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 17
       FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[16] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= amp_sv[0];
@@ -686,8 +795,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 18
       FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[17] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= amp_sv[0];
@@ -699,8 +811,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 19
       FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[18] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -714,8 +829,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 20
       VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[19] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -730,8 +848,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 21
       FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[20] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -744,8 +865,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 22
       FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[21] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -758,8 +882,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 23
       VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[22] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -774,8 +901,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 24
       FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[23] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -788,8 +918,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 25
       FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[24] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -802,8 +935,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 26
       FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[25] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= amp_sv[0];
@@ -815,8 +951,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 27
       FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[26] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= amp_sv[0];
@@ -828,8 +967,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 28
       FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[27] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= amp_sv[0];
@@ -841,8 +983,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 29
       FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[28] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= amp_sv[0];
@@ -854,8 +999,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 30
       FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[29] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -868,8 +1016,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 31
       VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[30] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
@@ -910,8 +1061,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 33
       FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[32] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] -= amp_sv[0];
@@ -923,8 +1077,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 34
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[33] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] -= amp_sv[0];
@@ -936,8 +1093,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 35
       FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[34] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -950,8 +1110,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 36
       VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[35] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -966,8 +1129,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 37
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[36] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -980,8 +1146,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 38
       FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[37] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -994,8 +1163,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 39
       VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[38] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -1010,8 +1182,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 40
       FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[39] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1024,8 +1199,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 41
       FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[40] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1038,8 +1216,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 42
       FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[41] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] -= amp_sv[0];
@@ -1051,8 +1232,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 43
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[42] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] -= amp_sv[0];
@@ -1064,8 +1248,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 44
       FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[43] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] -= amp_sv[0];
@@ -1077,8 +1264,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 45
       FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[44] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] -= amp_sv[0];
@@ -1090,8 +1280,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 46
       FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[45] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1104,8 +1297,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 47
       VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[46] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[11] -= amp_sv[0];
@@ -1143,8 +1339,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 49
       FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[48] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1157,8 +1356,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 50
       FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[49] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
@@ -1173,8 +1375,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 51
       FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[50] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1187,8 +1392,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 52
       FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[51] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1201,8 +1409,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 53
       FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[52] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -1217,8 +1428,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 54
       FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[53] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1231,8 +1445,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 55
       FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[54] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
@@ -1247,8 +1464,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 56
       FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[55] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += amp_sv[0];
       jamp_sv[11] -= amp_sv[0];
@@ -1263,8 +1483,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 57
       VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[56] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1317,8 +1540,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 59
       VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[58] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1337,8 +1563,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 60
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[59] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1357,8 +1586,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 61
       FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[60] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -1373,8 +1605,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 62
       FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[61] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1387,8 +1622,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 63
       FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[62] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -1403,8 +1641,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 64
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[63] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1418,8 +1659,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 65
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[64] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1432,8 +1676,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 66
       FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[65] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
@@ -1448,8 +1695,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 67
       FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[66] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1462,8 +1712,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 68
       FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[67] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1476,8 +1729,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 69
       FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[68] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -1492,8 +1748,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 70
       FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[69] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1506,8 +1765,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 71
       FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[70] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1522,8 +1784,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 72
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[71] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += amp_sv[0];
       jamp_sv[9] -= amp_sv[0];
@@ -1538,8 +1803,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 73
       VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[72] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1592,8 +1860,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 75
       VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[74] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1612,8 +1883,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 76
       VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[75] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1632,8 +1906,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 77
       FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[76] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -1648,8 +1925,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 78
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[77] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1662,8 +1942,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 79
       FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[78] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1678,8 +1961,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 80
       FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[79] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1692,8 +1978,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 81
       FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[80] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= amp_sv[0];
@@ -1705,8 +1994,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 82
       FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[81] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= amp_sv[0];
@@ -1718,8 +2010,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 83
       FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[82] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] -= amp_sv[0];
@@ -1731,8 +2026,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 84
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[83] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= amp_sv[0];
@@ -1744,8 +2042,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 85
       FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[84] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1758,8 +2059,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 86
       FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[85] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1774,8 +2078,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 87
       FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[86] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= amp_sv[0];
@@ -1787,8 +2094,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 88
       FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[87] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= amp_sv[0];
@@ -1800,8 +2110,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 89
       FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[88] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= amp_sv[0];
@@ -1813,8 +2126,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 90
       FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[89] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= amp_sv[0];
@@ -1826,8 +2142,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 91
       FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[90] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1840,8 +2159,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 92
       FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[91] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1890,8 +2212,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 94
       VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[93] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1910,8 +2235,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 95
       VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[94] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1930,8 +2258,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 96
       FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[95] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[19] -= amp_sv[0];
@@ -1946,8 +2277,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 97
       FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[96] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1960,8 +2294,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 98
       FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[97] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -1976,8 +2313,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 99
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[98] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2024,8 +2364,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 101
       VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[100] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2044,8 +2387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 102
       VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[101] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2064,8 +2410,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 103
       FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[102] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -2080,8 +2429,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 104
       FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[103] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2094,8 +2446,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 105
       FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[104] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[4] -= amp_sv[0];
@@ -2110,8 +2465,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 106
       FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[105] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2158,8 +2516,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 108
       VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[107] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2178,8 +2539,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 109
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[108] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2198,8 +2562,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 110
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[109] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= amp_sv[0];
@@ -2211,8 +2578,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 111
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[110] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= amp_sv[0];
@@ -2224,8 +2594,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 112
       FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[111] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= amp_sv[0];
@@ -2237,8 +2610,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 113
       FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[112] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= amp_sv[0];
@@ -2877,9 +3253,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -2955,8 +3330,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
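The finalize hunk below walks the enlarged numerator super-buffer: nGoodHel helicity slices, each holding nevt events times ndiagrams per-diagram numerators, i.e. offset `( ievt + ighel * nevt ) * ndiagrams`. A sketch of that indexing with hypothetical names and sizes:

```cpp
// Sketch only: locate the per-diagram numerator slice for (helicity, event).
#include <cassert>

constexpr int ndiagrams = 4; // stand-in for processConfig::ndiagrams

inline double* helNumerators( double* ghelAllNumerators, int ighel, int ievt, int nevt )
{
  return ghelAllNumerators + ( ievt + ighel * nevt ) * ndiagrams;
}

int main()
{
  constexpr int nGoodHel = 2, nevt = 8;
  double buf[nGoodHel * nevt * ndiagrams] = {};
  buf[( 3 + 1 * nevt ) * ndiagrams] = 1.; // helicity #1, event #3, diagram #0
  assert( helNumerators( buf, 1, 3, nevt )[0] == 1. );
  return 0;
}
```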
@@ -3018,25 +3392,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights, // if true, compute final multichannel weights
+    bool mulChannelWeight, // if true, multiply matrix element by channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -3081,16 +3465,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol, // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -3158,6 +3570,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -3167,6 +3580,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -3178,8 +3593,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -3207,7 +3624,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
non-trivial access: buffer includes all events #endif #endif @@ -3223,7 +3639,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -3237,11 +3653,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -3253,6 +3672,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -3261,9 +3681,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -3274,9 +3695,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3314,40 +3741,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -3362,7 +3755,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -3410,82 +3804,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -3510,13 +3919,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -3524,13 +3927,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 7b57d7c763..e3c578f5e0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h new file mode 100644 index 0000000000..4f4a3c3bc0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 
2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGG_H
+#define MG5_CONFIG_SIGMA_SM_GG_TTXGG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 123;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cstddef> // std::size_t
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool is_good_hel[CPPProcess::ncomb];
+    sigmaKin_getGoodHel(
+      momenta,
+      couplings,
+      matrix_elements,
+      numerators,
+      denominators,
+#ifdef MGONGPUCPP_GPUIMPL
+      color_jamps,
+#endif
+      is_good_hel,
+      count );
+    sigmaKin_setGoodHel( is_good_hel );
+    return nullptr;
+  }
+
+  void initialize(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    // static local initialization is called exactly once in a thread-safe way
+    static void* dummy = initialize_impl( momenta, couplings, matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+                                          color_jamps,
+#endif
+                                          numerators,
+                                          denominators,
+                                          count );
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__
+#endif
+    void
+    transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride )
+  {
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    std::size_t i_page = i_event / page_size;
+    std::size_t i_vector = i_event % page_size;
+
+    for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part )
+    {
+      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
+      {
+        momenta_out[i_page * CPPProcess::npar * 4 * page_size +
+                    i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event];
+      }
+    }
+  }
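For reference, the transposition above maps a caller-side layout, momenta_in[stride * ( npar * i_mom + i_part ) + i_event], onto the cudacpp AOSOA layout with page size neppM. A minimal standalone sketch of the same index arithmetic, using hypothetical toy sizes (npar = 2, page_size = 4) instead of the real CPPProcess::npar and MemoryAccessMomentaBase::neppM:

#include <cstddef>
#include <vector>

// Toy AOSOA transposition: the same arithmetic as transpose_momenta above,
// with made-up sizes so it can compile and run standalone.
int main()
{
  const std::size_t npar = 2, page_size = 4, nevt = 8, stride = nevt;
  std::vector<double> in( stride * npar * 4 ), out( nevt * npar * 4 );
  for( std::size_t i = 0; i < in.size(); ++i ) in[i] = double( i );
  for( std::size_t i_event = 0; i_event < nevt; ++i_event )
  {
    const std::size_t i_page = i_event / page_size;   // which AOSOA page
    const std::size_t i_vector = i_event % page_size; // slot within the page
    for( std::size_t i_part = 0; i_part < npar; ++i_part )
      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
        out[i_page * npar * 4 * page_size + i_part * 4 * page_size + i_mom * page_size + i_vector] =
          in[stride * ( npar * i_mom + i_part ) + i_event];
  }
  return 0;
}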
+
+#ifdef MGONGPUCPP_GPUIMPL
+
+  __global__ void copy_inputs(
+    const double* momenta_in,
+    const double* helicity_random_in,
+    const double* color_random_in,
+    const double* diagram_random_in,
+    const double* alpha_s_in,
+    fptype* momenta,
+    fptype* helicity_random,
+    fptype* color_random,
+    fptype* diagram_random,
+    fptype* g_s,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
+    helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
+    color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
+    g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s = sqrt( 4 * pi * 0.118 ), i.e. alpha_s = 0.118
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
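The initialize() helper above relies on the guaranteed one-time, thread-safe initialization of a function-local static. A minimal sketch of the same exactly-once idea expressed with std::call_once instead (hypothetical helper, not part of this file):

#include <mutex>
#include <utility>

// Hypothetical alternative to the static-local trick in initialize():
// std::call_once also guarantees exactly-once, thread-safe execution.
template<typename F>
void run_once( F&& f )
{
  static std::once_flag flag;
  std::call_once( flag, std::forward<F>( f ) );
}

// usage: run_once( [&] { /* one-time helicity-filtering setup */ } );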
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
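As a usage illustration (hypothetical host code, not part of this PR), the metadata entry point above can be exercised as follows; this assumes the integer-sized results used by the implementation for the count keys:

#include "umami.h"
#include <cstdio>

// Hypothetical caller: query the process shape before allocating buffers.
int query_process_shape()
{
  UmamiDevice device;
  int npar = 0, ndiag = 0, nhel = 0;
  if( umami_get_meta( UMAMI_META_DEVICE, &device ) != UMAMI_SUCCESS ) return -1;
  if( umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ) != UMAMI_SUCCESS ) return -1;
  if( umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiag ) != UMAMI_SUCCESS ) return -1;
  if( umami_get_meta( UMAMI_META_HELICITY_COUNT, &nhel ) != UMAMI_SUCCESS ) return -1;
  std::printf( "device=%d npar=%d ndiagrams=%d ncomb=%d\n", (int)device, npar, ndiag, nhel );
  return 0;
}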
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to twice the SIMD page size, presumably because in "mixed"
+    // precision mode the SIMD kernels process two neppV pages per call (see #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s = sqrt( 4 * pi * 0.118 )
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
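Putting the pieces together, a hypothetical end-to-end caller of this interface might look as follows (a sketch only: the param_card path, event count and stride choice are illustrative, and momenta must be filled with real phase-space points; integer-sized metadata results are assumed as above):

#include "umami.h"
#include <cstdio>
#include <vector>

// Hypothetical driver: evaluate |M|^2 for a small batch of events.
// Momenta are laid out as momenta[stride * ( npar * i_mom + i_part ) + i_event],
// matching the transposition performed inside umami_matrix_element.
int main()
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1;

  int npar = 0;
  umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar );

  const size_t count = 16, stride = count, offset = 0;
  std::vector<double> momenta( stride * npar * 4 ), alpha_s( count, 0.118 );
  std::vector<double> m2( count );
  // ... fill momenta with phase-space points here ...

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
  const void* ins[] = { momenta.data(), alpha_s.data() };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outs[] = { m2.data() };

  UmamiStatus status = umami_matrix_element( handle, count, stride, offset,
                                             2, in_keys, ins, 1, out_keys, outs );
  if( status == UMAMI_SUCCESS )
    for( size_t i = 0; i < count; ++i ) std::printf( "m2[%zu] = %g\n", i, m2[i] );

  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}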
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *   _   _ _ __ ___   __ _ _ __ ___  _
+ *  | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ *  | |_| | | | | | | (_| | | | | | | |
+ *   \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ *  Unified MAtrix eleMent Interface
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h> // size_t
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries a global metadata entry of the compiled process (device type, particle
+   * count, diagram count, helicity combinations, ...).
+   *
+   * @param meta_key
+   *    key identifying the metadata entry to retrieve
+   * @param result
+   *    pointer to caller-allocated storage for the result; its type depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @param param_card_path
+   *    path to the parameter file
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
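The "see memory layout" references in the docstring below are not expanded anywhere in this header; as a hedged summary, derived from the indexing that umami.cc actually performs, the convention is:

// Batch memory layout as consumed by this implementation (see umami.cc):
//   momenta:  momenta[offset + stride * ( npar * i_mom + i_part ) + i_event]
//   amp2:     amp2[offset + stride * i_diag + i_event]
//   scalars:  x[offset + i_event]   (alpha_s, random numbers, per-event outputs)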
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs of the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
index 691a9d08c7..2cc4b19c4e 100644
--- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@
 Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -57,7 +58,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.004384040832519531
+DEBUG: model prefixing takes 0.001790761947631836
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.118 s +1 processes with 123 diagrams generated in 0.083 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.366 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +Generated helas calls for 1 subprocesses (123 diagrams) in 0.152 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.231 s +ALOHA: aloha creates 5 routines in 0.173 s VVV1 VVV1 FFV1 @@ -189,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.231 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.208s -user 0m1.150s -sys 0m0.049s -Code generation completed in 2 seconds +real 0m2.439s +user 0m0.757s +sys 0m0.131s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index d43252d697..d9dc04eb8b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef
MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -2934,9 +2995,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, 
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -3012,8 +3072,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -3075,25 +3134,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -3138,16 +3207,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random 
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -3215,6 +3312,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -3224,6 +3322,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -3235,8 
+3335,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -3264,7 +3366,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -3280,7 +3381,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -3294,11 +3395,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -3310,6 +3414,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -3318,9 +3423,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, 
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -3331,9 +3437,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3371,40 +3483,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -3419,7 +3497,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -3467,82 +3546,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -3567,13 +3661,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -3581,13 +3669,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 7b57d7c763..e3c578f5e0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const 
fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h new file mode 100644 index 0000000000..4f4a3c3bc0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 123; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } 
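[Note on the pattern used here: the `initialize` wrapper that follows relies on C++11 "magic statics" — the initializer of a function-local static variable is guaranteed to run exactly once, even when several threads call the function concurrently, which is why `initialize_impl` above returns a dummy `void*` that can seed such a static. A minimal standalone sketch of the same idiom; the names `ensureSetup` and `expensiveSetupImpl` are illustrative, not from the patch:]

#include <cstdio>

static int expensiveSetupImpl()
{
  // stands in for the one-time work (helicity filtering in the real code)
  std::printf( "setup runs exactly once\n" );
  return 0;
}

static void ensureSetup()
{
  // C++11 guarantees thread-safe, once-only initialization of this static:
  // the first caller runs expensiveSetupImpl(), all later callers skip it
  static int dummy = expensiveSetupImpl();
  (void)dummy; // silence unused-variable warnings
}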
+ + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Queries global metadata of the matrix element code, e.g. the device it was built
+   * for or the number of particles, diagrams, helicity and color configurations.
+   *
+   * @param meta_key
+   *    key of the metadata entry to query
+   * @param result
+   *    pointer to the memory that receives the value; its type depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
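+   *
+   * A minimal usage sketch (illustrative only: the buffer sizes, the number of
+   * external particles and the parameter-card path are assumptions of this example,
+   * not prescribed by the interface):
+   * @code
+   *   UmamiHandle handle;
+   *   umami_initialize( &handle, "param_card.dat" );
+   *   size_t n = 16;                  // events in the batch
+   *   double momenta[16 * 6 * 4];     // e.g. 6 external particles, one 4-momentum each
+   *   double me[16];                  // one matrix element value per event
+   *   UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA };
+   *   void const* in_ptrs[] = { momenta };
+   *   UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* out_ptrs[] = { me };
+   *   umami_matrix_element( handle, n, n, 0, 1, in_keys, in_ptrs, 1, out_keys, out_ptrs );
+   *   umami_free( handle );
+   * @endcode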
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index 5908592d13..da1647639c 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version. This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -57,7 +58,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.0061588287353515625 
+DEBUG: model prefixing takes 0.0017483234405517578 
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -150,27 +151,27 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.427 s +1 processes with 1240 diagrams generated in 0.713 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 3s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h @@ -178,25 +179,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 
405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 
705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 
119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 
551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 
830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.574 s -Wrote files for 2281 helas calls in 17.935 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 
225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 
674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 
928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 
406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 
705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 2.518 s +Wrote files for 2281 helas calls in 34.890 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.379 s +ALOHA: aloha creates 5 routines in 0.190 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.232 s +ALOHA: aloha creates 10 routines in 0.181 s VVV1 VVV1 FFV1 @@ -209,32 +210,34 @@ ALOHA: aloha creates 10 routines in 0.232 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m31.040s -user 0m30.219s -sys 0m0.591s -Code generation completed in 31 seconds +real 0m46.871s +user 0m13.972s +sys 0m1.182s +Code generation completed in 46 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -255,9 +258,9 @@ Code generation completed in 31 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -284,9 +287,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 85e7f8f09c..e9c80d8364 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL 
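/* kernelAccessP returns a pointer to the first SIMD word of the event record instead of a
   reference to a single fptype_sv, so the generated code can index one fptype_sv slot per
   diagram. A sketch of the access pattern it enables (NUM_ACCESS, ievt0 and idiag are taken
   from or assumed after the CPPProcess.cc hunks later in this patch):
     fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     numerators_sv[idiag] += cxabs2( amp_sv[0] ); // accumulate |amp|^2 for diagram 'idiag'
*/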
+ , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
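/* getChannelId collapses the per-event channelId buffer to one scalar: it returns 0 when
   multichannel is disabled (allChannelIds == nullptr), and otherwise asserts (#898, #924)
   that every event of the SIMD page (and, in "mixed" double/float precision, of both neppV
   pages) carries the same channel. A sketch of the C++-side call pattern (ievt00 as in the
   surrounding code; the branch body is illustrative):
     const unsigned int channelId = getChannelId( allChannelIds, ievt00 );
     if( channelId != 0 )
     {
       // single-diagram enhancement is active for this event page
     }
*/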
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; @@ -428,8 +492,11 @@ namespace mg5amcCpu // Amplitude(s) 
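/* From here on the generated diagram code follows one mechanical pattern: under
   storeChannelWeights, diagram N adds cxabs2( amp_sv[0] ) into its own slot
   numerators_sv[N-1] and into the shared denominators_sv, replacing the old per-diagram
   'if( channelId == N )' / 'if( channelId != 0 )' tests. A sketch of the per-channel weight
   this makes possible downstream (the consumer is not part of these hunks, so this is an
   assumption about the intended use):
     // weight of channel 'channelId' (1-based) for event 'ievt':
     const fptype w = numerators[ievt * ndiagrams + ( channelId - 1 )] / denominators[ievt];
*/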
for diagram number 2 VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -515,8 +582,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; jamp_sv[27] += amp_sv[0]; @@ -543,8 +613,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; jamp_sv[26] += amp_sv[0]; @@ -629,8 +702,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; jamp_sv[29] += amp_sv[0]; @@ -657,8 +733,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; jamp_sv[28] += amp_sv[0]; @@ -981,8 +1060,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -1009,8 +1091,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -1037,8 +1122,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] 
); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -1123,8 +1211,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -1151,8 +1242,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -1179,8 +1273,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[3] -= amp_sv[0]; @@ -1265,8 +1362,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -1293,8 +1393,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -1321,8 +1424,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -1411,8 +1517,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += 
cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1425,8 +1534,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1439,8 +1551,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1459,8 +1574,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += amp_sv[0]; jamp_sv[68] -= amp_sv[0]; @@ -1475,8 +1593,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1495,8 +1616,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[62] -= amp_sv[0]; @@ -1546,8 +1670,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1560,8 +1687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[33] += cxabs2( amp_sv[0] 
); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1574,8 +1704,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += amp_sv[0]; jamp_sv[55] -= amp_sv[0]; @@ -1590,8 +1723,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1604,8 +1740,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 37 FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[36] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1618,8 +1757,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 38 FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[37] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[54] -= amp_sv[0]; @@ -1634,8 +1776,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 39 FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[38] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[49] -= amp_sv[0]; @@ -1650,8 +1795,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 40 FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[39] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += amp_sv[0]; jamp_sv[65] -= amp_sv[0]; @@ -1666,8 +1814,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 41 FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[40] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( 
amp_sv[0] ); + } #endif jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1687,8 +1838,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 42 FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[41] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1701,8 +1855,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 43 FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[42] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1715,8 +1872,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 44 VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[43] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1735,8 +1895,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 45 FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[44] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; @@ -1751,8 +1914,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 46 VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[45] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1771,8 +1937,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 47 FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[46] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += amp_sv[0]; jamp_sv[86] -= amp_sv[0]; @@ -1822,8 +1991,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 49 FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[48] += cxabs2( amp_sv[0] ); + 
denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1836,8 +2008,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 50 FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[49] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1850,8 +2025,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 51 FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[50] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += amp_sv[0]; jamp_sv[79] -= amp_sv[0]; @@ -1866,8 +2044,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 52 FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[51] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1880,8 +2061,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 53 FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[52] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1894,8 +2078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 54 FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[53] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[78] -= amp_sv[0]; @@ -1910,8 +2097,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 55 FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[54] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[73] -= amp_sv[0]; @@ -1926,8 +2116,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 56 FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[55] += cxabs2( amp_sv[0] ); + 
denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] += amp_sv[0]; jamp_sv[89] -= amp_sv[0]; @@ -1942,8 +2135,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 57 FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[56] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; @@ -1963,8 +2159,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 58 FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[57] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1977,8 +2176,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 59 FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[58] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1991,8 +2193,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 60 VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[59] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2011,8 +2216,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 61 FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[60] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; @@ -2027,8 +2235,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 62 VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[61] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2047,8 +2258,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 63 FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + 
numerators_sv[62] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] += amp_sv[0]; jamp_sv[110] -= amp_sv[0]; @@ -2097,8 +2311,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 65 FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[64] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2111,8 +2328,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 66 FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[65] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2125,8 +2345,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 67 FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[66] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; @@ -2141,8 +2364,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 68 FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[67] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2155,8 +2381,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 69 FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[68] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2169,8 +2398,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 70 FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[69] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; @@ -2185,8 +2417,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 71 FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { 
+ numerators_sv[70] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; @@ -2201,8 +2436,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 72 FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[71] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[112] += amp_sv[0]; jamp_sv[113] -= amp_sv[0]; @@ -2217,8 +2455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 73 FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[72] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2238,8 +2479,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 74 FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 74 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[73] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2252,8 +2496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 75 FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[74] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2266,8 +2513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 76 VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[75] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2286,8 +2536,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 77 FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[76] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -2302,8 +2555,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 78 VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + 
numerators_sv[77] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2322,8 +2578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 79 FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[78] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; @@ -2372,8 +2631,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 81 FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[80] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -2388,8 +2650,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 82 FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[81] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; @@ -2404,8 +2669,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 83 FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[82] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2424,8 +2692,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 84 FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[83] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2438,8 +2709,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 85 FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[84] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2452,8 +2726,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 86 VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[85] 
+= cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2472,8 +2749,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 87 FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[86] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -2488,8 +2768,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 88 VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[87] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2508,8 +2791,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 89 FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[88] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; @@ -2558,8 +2844,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 91 FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[90] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -2574,8 +2863,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 92 FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[91] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += amp_sv[0]; jamp_sv[68] -= amp_sv[0]; @@ -2590,8 +2882,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 93 FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 93 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[92] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; @@ -2610,8 +2905,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 94 FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[93] += cxabs2( amp_sv[0] ); + denominators_sv += 
cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2624,8 +2922,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 95 FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[94] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2638,8 +2939,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 96 VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[95] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2658,8 +2962,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 97 FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[96] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -2674,8 +2981,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 98 VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[97] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -2694,8 +3004,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 99 FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[98] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -2744,8 +3057,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 101 FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[100] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -2760,8 +3076,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 102 FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[101] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( 
amp_sv[0] );
+    }
 #endif
     jamp_sv[60] += amp_sv[0];
     jamp_sv[62] -= amp_sv[0];
@@ -2776,8 +3095,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 103
     FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[102] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2796,8 +3118,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 104
     FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[103] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += amp_sv[0];
     jamp_sv[5] -= amp_sv[0];
@@ -2812,8 +3137,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 105
     FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[104] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2832,8 +3160,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 106
     FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[105] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[64] += amp_sv[0];
     jamp_sv[65] -= amp_sv[0];
@@ -2848,8 +3179,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 107
     FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 107 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[106] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2868,8 +3202,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 108
     FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[107] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2888,8 +3225,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 109
     FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[108] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2908,8 +3248,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 110
     FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[109] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += amp_sv[0];
     jamp_sv[3] -= amp_sv[0];
@@ -2924,8 +3267,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 111
     FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[110] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2944,8 +3290,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 112
     FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[111] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[70] += amp_sv[0];
     jamp_sv[71] -= amp_sv[0];
@@ -2960,8 +3309,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 113
     FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[112] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2980,8 +3332,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 114
     FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 114 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[113] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3000,8 +3355,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 115
     FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 115 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[114] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3020,8 +3378,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 116
     FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 116 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[115] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += amp_sv[0];
     jamp_sv[1] -= amp_sv[0];
@@ -3036,8 +3397,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 117
     FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 117 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[116] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3056,8 +3420,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 118
     FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 118 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[117] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[94] += amp_sv[0];
     jamp_sv[95] -= amp_sv[0];
@@ -3072,8 +3439,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 119
     FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 119 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[118] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3092,8 +3462,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 120
     FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 120 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[119] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3112,8 +3485,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 121
     FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 121 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[120] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3203,8 +3579,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 124
     FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 124 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[123] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[11] -= amp_sv[0];

@@ -3216,8 +3595,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 125
     FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 125 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[124] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] -= amp_sv[0];

@@ -3230,9 +3612,12 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 126
     FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 126 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
+    if( storeChannelWeights )
+    {
+      numerators_sv[125] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
+#endif
     jamp_sv[17] -= amp_sv[0];

     // *** DIAGRAM 127 OF 1240 ***
@@ -3243,8 +3628,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 127
     FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 127 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[126] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] -= amp_sv[0];

@@ -3256,8 +3644,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 128
     FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 128 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[127] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[23] -= amp_sv[0];

@@ -3269,8 +3660,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 129
     FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 129 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[128] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[21] -= amp_sv[0];

@@ -3282,8 +3676,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 130
     VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 130 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[129] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += amp_sv[0];
     jamp_sv[15] -= amp_sv[0];
@@ -3298,8 +3695,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 131
     FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 131 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[130] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3312,8 +3712,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 132
     FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 132 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[131] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3326,8 +3729,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 133
     VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 133 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[132] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[11] += amp_sv[0];
     jamp_sv[15] -= amp_sv[0];
@@ -3342,8 +3748,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 134
     FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 134 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[133] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3356,8 +3765,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 135
     FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 135 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[134] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3370,8 +3782,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 136
     VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 136 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[135] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += amp_sv[0];
     jamp_sv[11] -= amp_sv[0];
@@ -3386,8 +3801,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 137
     FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 137 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[136] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3400,8 +3818,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 138
     FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 138 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[137] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3438,8 +3859,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 140
     VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 140 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[139] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3458,8 +3882,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 141
     VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 141 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[140] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3512,8 +3939,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 143
     FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 143 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[142] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3526,8 +3956,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 144
     FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 144 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[143] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] += amp_sv[0];
     jamp_sv[14] -= amp_sv[0];
@@ -3542,8 +3975,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 145
     FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 145 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[144] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3556,8 +3992,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 146
     FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 146 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[145] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[18] += amp_sv[0];
     jamp_sv[20] -= amp_sv[0];
@@ -3572,8 +4011,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 147
     FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 147 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[146] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3586,8 +4028,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 148
     VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 148 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[147] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
@@ -3602,8 +4047,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 149
     FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 149 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[148] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3616,8 +4064,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 150
     FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 150 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[149] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3630,8 +4081,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 151
     VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 151 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[150] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += amp_sv[0];
     jamp_sv[6] -= amp_sv[0];
@@ -3646,8 +4100,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 152
     FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 152 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[151] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3660,8 +4117,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 153
     FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 153 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[152] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += amp_sv[0];
     jamp_sv[1] -= amp_sv[0];
@@ -3676,8 +4136,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 154
     VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 154 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[153] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3696,8 +4159,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 155
     FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 155 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[154] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] += amp_sv[0];
     jamp_sv[17] -= amp_sv[0];
@@ -3713,8 +4179,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 156
     VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 156 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[155] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3733,8 +4202,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 157
     VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 157 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[156] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3787,8 +4259,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 159
     FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 159 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[158] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3801,8 +4276,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 160
     FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 160 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[159] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -3817,8 +4295,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 161
     FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 161 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[160] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3831,8 +4312,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 162
     FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 162 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[161] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[19] += amp_sv[0];
     jamp_sv[20] -= amp_sv[0];
@@ -3847,8 +4331,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 163
     FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 163 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[162] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3861,8 +4348,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 164
     VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 164 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[163] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] += amp_sv[0];
     jamp_sv[13] -= amp_sv[0];
@@ -3877,8 +4367,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 165
     FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 165 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[164] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3891,8 +4384,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 166
     FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 166 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[165] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3905,8 +4401,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 167
     VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 167 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[166] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += amp_sv[0];
     jamp_sv[6] -= amp_sv[0];
@@ -3921,8 +4420,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 168
     FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 168 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[167] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -3935,8 +4437,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 169
     FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 169 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[168] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] += amp_sv[0];
     jamp_sv[3] -= amp_sv[0];
@@ -3951,8 +4456,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 170
     VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 170 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[169] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -3971,8 +4479,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 171
     FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 171 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[170] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[10] += amp_sv[0];
     jamp_sv[11] -= amp_sv[0];
@@ -3988,8 +4499,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 172
     VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 172 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[171] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4008,8 +4522,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 173
     VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 173 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[172] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4062,8 +4579,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 175
     FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 175 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[174] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4076,8 +4596,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 176
     FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 176 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[175] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += amp_sv[0];
     jamp_sv[8] -= amp_sv[0];
@@ -4092,8 +4615,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 177
     FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 177 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[176] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4106,8 +4632,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 178
     FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 178 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[177] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[13] += amp_sv[0];
     jamp_sv[14] -= amp_sv[0];
@@ -4122,8 +4651,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 179
     FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 179 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[178] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4136,8 +4668,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 180
     VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 180 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[179] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] += amp_sv[0];
     jamp_sv[13] -= amp_sv[0];
@@ -4152,8 +4687,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 181
     FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 181 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[180] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4166,8 +4704,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 182
     FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 182 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[181] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4180,8 +4721,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 183
     VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 183 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[182] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += amp_sv[0];
     jamp_sv[7] -= amp_sv[0];
@@ -4196,8 +4740,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 184
     FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 184 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[183] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -4210,8 +4757,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 185
     FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 185 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[184] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] += amp_sv[0];
     jamp_sv[5] -= amp_sv[0];
@@ -4226,8 +4776,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 186
     VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 186 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[185] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -4246,8 +4799,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 187
     FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 187 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[186] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[8] += amp_sv[0];
     jamp_sv[9] -= amp_sv[0];
@@ -4262,8 +4818,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 188
     FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 188 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[187] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[5] -= amp_sv[0];

@@ -4275,8 +4834,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 189
     FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 189 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[188] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[3] -= amp_sv[0];

@@ -4288,8 +4850,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 190
     FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 190 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[189] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[16] -= amp_sv[0];

@@ -4301,8 +4866,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 191
     FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 191 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[190] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[13] -= amp_sv[0];

@@ -4314,8 +4882,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 192
     FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 192 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[191] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[22] -= amp_sv[0];

@@ -4327,8 +4898,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 193
     FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 193 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[192] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] -= amp_sv[0]; @@ -4340,8 +4914,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 194 FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 194 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[193] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4354,8 +4931,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 195 VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 195 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[194] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -4370,8 +4950,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 196 FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 196 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[195] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4384,8 +4967,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 197 FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 197 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[196] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -4397,8 +4983,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 198 FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 198 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[197] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -4410,8 +4999,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 199 FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 199 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[198] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= amp_sv[0]; @@ -4423,8 +5015,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 200 FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 200 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + 
if( storeChannelWeights ) + { + numerators_sv[199] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= amp_sv[0]; @@ -4436,8 +5031,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 201 FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 201 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[200] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] -= amp_sv[0]; @@ -4449,8 +5047,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 202 FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 202 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[201] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] -= amp_sv[0]; @@ -4462,8 +5063,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 203 FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 203 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[202] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4476,8 +5080,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 204 VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 204 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[203] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -4492,8 +5099,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 205 FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 205 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[204] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4506,8 +5116,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 206 FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 206 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[205] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -4519,8 +5132,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 207 FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 207 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[206] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -4532,8 
+5148,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 208 FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 208 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[207] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= amp_sv[0]; @@ -4545,8 +5164,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 209 FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 209 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[208] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= amp_sv[0]; @@ -4558,8 +5180,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 210 FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 210 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[209] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[14] -= amp_sv[0]; @@ -4571,8 +5196,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 211 FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 211 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[210] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] -= amp_sv[0]; @@ -4584,8 +5212,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 212 FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 212 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[211] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4598,8 +5229,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 213 VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 213 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[212] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -4614,8 +5248,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 214 FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 214 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[213] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4628,8 +5265,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 215 FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 215 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[214] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4642,8 +5282,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 216 FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 216 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[215] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -4658,8 +5301,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 217 VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 217 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[216] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4678,8 +5324,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 218 VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 218 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[217] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4732,8 +5381,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 220 FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 220 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[219] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -4748,8 +5400,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 221 FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 221 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[220] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4762,8 +5417,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 222 FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 222 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[221] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4776,8 +5434,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 223 FFV1_0( w_fp[3], w_fp[77], 
w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 223 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[222] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -4792,8 +5453,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 224 VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 224 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[223] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4812,8 +5476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 225 VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 225 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[224] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4866,8 +5533,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 227 FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 227 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[226] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -4882,8 +5552,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 228 FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 228 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[227] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4896,8 +5569,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 229 FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 229 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[228] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -4910,8 +5586,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 230 FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 230 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[229] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -4926,8 +5605,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 231 VVV1_0( w_fp[62], 
w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 231 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[230] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -4946,8 +5628,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 232 VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 232 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[231] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -5000,8 +5685,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 234 FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 234 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[233] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -5016,8 +5704,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 235 FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 235 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[234] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5329,8 +6020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 247 FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 247 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[246] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[41] -= amp_sv[0]; @@ -5342,8 +6036,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 248 FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 248 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[247] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[47] -= amp_sv[0]; @@ -5356,8 +6053,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 249 FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 249 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[248] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] -= amp_sv[0]; @@ -5369,8 +6069,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 250 FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 250 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[249] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] -= amp_sv[0]; @@ -5382,8 +6085,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 251 FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 251 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[250] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] -= amp_sv[0]; @@ -5395,8 +6101,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 252 FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 252 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[251] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[39] -= amp_sv[0]; @@ -5408,8 +6117,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 253 VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 253 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[252] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; @@ -5424,8 +6136,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 254 FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 254 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[253] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5438,8 +6153,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 255 FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 255 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[254] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -5452,8 +6170,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 256 VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 256 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[255] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[35] += amp_sv[0]; jamp_sv[39] -= amp_sv[0]; @@ -5468,8 +6189,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 257 FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 257 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + 
+        numerators_sv[256] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5482,8 +6206,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 258
       FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 258 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[257] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5496,8 +6223,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 259
       VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 259 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[258] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += amp_sv[0];
       jamp_sv[35] -= amp_sv[0];
@@ -5512,8 +6242,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 260
       FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 260 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[259] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5526,8 +6259,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 261
       FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 261 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[260] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5562,8 +6298,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 263
       VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 263 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[262] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5582,8 +6321,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 264
       VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 264 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[263] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5636,8 +6378,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 266
       FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 266 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[265] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5650,8 +6395,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 267
       FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 267 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[266] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] += amp_sv[0];
       jamp_sv[59] -= amp_sv[0];
@@ -5666,8 +6414,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 268
       FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 268 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[267] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5680,8 +6431,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 269
       FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 269 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[268] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += amp_sv[0];
       jamp_sv[57] -= amp_sv[0];
@@ -5696,8 +6450,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 270
       FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 270 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[269] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5710,8 +6467,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 271
       VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 271 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[270] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] += amp_sv[0];
       jamp_sv[87] -= amp_sv[0];
@@ -5726,8 +6486,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 272
       FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 272 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[271] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5740,8 +6503,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 273
       FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 273 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[272] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5754,8 +6520,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 274
       VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 274 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[273] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] += amp_sv[0];
       jamp_sv[111] -= amp_sv[0];
@@ -5770,8 +6539,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 275
       FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 275 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[274] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5784,8 +6556,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 276
       FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 276 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[275] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[93] += amp_sv[0];
       jamp_sv[95] -= amp_sv[0];
@@ -5800,8 +6575,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 277
       VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 277 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[276] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5820,8 +6598,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 278
       FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 278 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[277] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += amp_sv[0];
       jamp_sv[35] -= amp_sv[0];
@@ -5836,8 +6617,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 279
       VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 279 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[278] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5856,8 +6640,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 280
       VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 280 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[279] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
@@ -5910,8 +6697,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 282
       FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 282 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[281] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5924,8 +6714,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 283
       FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 283 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[282] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] += amp_sv[0];
       jamp_sv[83] -= amp_sv[0];
@@ -5940,8 +6733,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 284
       FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 284 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[283] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5954,8 +6750,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 285
       FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 285 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[284] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] += amp_sv[0];
       jamp_sv[57] -= amp_sv[0];
@@ -5970,8 +6769,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 286
       FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 286 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[285] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -5984,8 +6786,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 287
       VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 287 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[286] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] += amp_sv[0];
       jamp_sv[63] -= amp_sv[0];
@@ -6000,8 +6805,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 288
       FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 288 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[287] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6014,8 +6822,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 289
       FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 289 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[288] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6028,8 +6839,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 290
       VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 290 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[289] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[107] += amp_sv[0];
       jamp_sv[111] -= amp_sv[0];
@@ -6044,8 +6858,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 291
       FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 291 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[290] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6058,8 +6875,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 292
       FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 292 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[291] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[69] += amp_sv[0];
       jamp_sv[71] -= amp_sv[0];
@@ -6074,8 +6894,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 293
       VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 293 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[292] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6094,8 +6917,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 294
       FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 294 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[293] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[39] += amp_sv[0];
       jamp_sv[41] -= amp_sv[0];
@@ -6110,8 +6936,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 295
       VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 295 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[294] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6130,8 +6959,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 296
       VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 296 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[295] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6184,8 +7016,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 298
       FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 298 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[297] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6198,8 +7033,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 299
       FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 299 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[298] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[47] += amp_sv[0];
       jamp_sv[83] -= amp_sv[0];
@@ -6214,8 +7052,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 300
       FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 300 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[299] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6228,8 +7069,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 301
       FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 301 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[300] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] += amp_sv[0];
       jamp_sv[59] -= amp_sv[0];
@@ -6244,8 +7088,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 302
       FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 302 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[301] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6258,8 +7105,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 303
       VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 303 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[302] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[59] += amp_sv[0];
       jamp_sv[63] -= amp_sv[0];
@@ -6274,8 +7124,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 304
       FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 304 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[303] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6288,8 +7141,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 305
       FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 305 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[304] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6302,8 +7158,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 306
       VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 306 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[305] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[83] += amp_sv[0];
       jamp_sv[87] -= amp_sv[0];
@@ -6318,8 +7177,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 307
       FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 307 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[306] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6332,8 +7194,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 308
       FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 308 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[307] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[63] += amp_sv[0];
       jamp_sv[65] -= amp_sv[0];
@@ -6348,8 +7213,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 309
       VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 309 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[308] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6368,8 +7236,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 310
       FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 310 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[309] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[45] += amp_sv[0];
       jamp_sv[47] -= amp_sv[0];
@@ -6384,8 +7255,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 311
       FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 311 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[310] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] -= amp_sv[0];

@@ -6397,8 +7271,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 312
       FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 312 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[311] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[71] -= amp_sv[0];

@@ -6410,8 +7287,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 313
       FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 313 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[312] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[59] -= amp_sv[0];

@@ -6423,8 +7303,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 314
       FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 314 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[313] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[69] -= amp_sv[0];

@@ -6436,8 +7319,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 315
       FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 315 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[314] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] -= amp_sv[0];

@@ -6449,8 +7335,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 316
       FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 316 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[315] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[63] -= amp_sv[0];

@@ -6462,8 +7351,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 317
       FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 317 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[316] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6476,8 +7368,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 318
       VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 318 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[317] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] += amp_sv[0];
       jamp_sv[59] -= amp_sv[0];
@@ -6492,8 +7387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 319
       FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 319 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[318] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6506,8 +7404,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 320
       FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 320 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[319] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[89] -= amp_sv[0];

@@ -6519,8 +7420,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 321
       FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 321 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[320] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[95] -= amp_sv[0];

@@ -6532,8 +7436,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 322
       FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 322 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[321] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[83] -= amp_sv[0];

@@ -6545,8 +7452,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 323
       FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 323 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[322] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[93] -= amp_sv[0];

@@ -6558,8 +7468,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 324
       FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 324 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[323] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] -= amp_sv[0];

@@ -6571,8 +7484,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 325
       FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 325 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[324] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[87] -= amp_sv[0];

@@ -6584,8 +7500,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 326
       FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 326 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[325] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6598,8 +7517,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 327
       VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 327 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[326] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] += amp_sv[0];
       jamp_sv[83] -= amp_sv[0];
@@ -6614,8 +7536,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 328
       FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 328 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[327] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6628,8 +7553,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 329
       FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 329 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[328] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[113] -= amp_sv[0];

@@ -6641,8 +7569,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 330
       FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 330 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[329] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[119] -= amp_sv[0];

@@ -6654,8 +7585,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 331
       FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 331 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[330] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[107] -= amp_sv[0];

@@ -6667,8 +7601,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 332
       FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 332 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[331] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[117] -= amp_sv[0];

@@ -6680,8 +7617,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 333
       FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 333 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[332] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] -= amp_sv[0];

@@ -6693,8 +7633,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 334
       FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 334 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[333] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[111] -= amp_sv[0];

@@ -6706,8 +7649,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 335
       FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 335 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[334] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6720,8 +7666,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 336
       VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 336 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[335] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] += amp_sv[0];
       jamp_sv[107] -= amp_sv[0];
@@ -6736,8 +7685,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 337
       FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 337 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[336] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6750,8 +7702,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 338
       FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 338 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[337] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6764,8 +7719,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 339
       FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 339 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[338] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] += amp_sv[0];
       jamp_sv[89] -= amp_sv[0];
@@ -6780,8 +7738,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 340
       VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 340 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[339] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6800,8 +7761,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 341
       VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 341 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[340] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6854,8 +7818,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 343
       FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 343 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[342] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] += amp_sv[0];
       jamp_sv[39] -= amp_sv[0];
@@ -6870,8 +7837,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 344
       FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 344 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[343] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6884,8 +7854,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 345
       FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 345 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[344] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -6898,8 +7871,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 346
       FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 346 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[345] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[71] += amp_sv[0];
       jamp_sv[89] -= amp_sv[0];
@@ -6914,8 +7890,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 347
       VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 347 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[346] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6934,8 +7913,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 348
       VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 348 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[347] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
@@ -6988,8 +7970,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 350
       FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 350 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[349] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[35] += amp_sv[0];
       jamp_sv[45] -= amp_sv[0];
@@ -7004,8 +7989,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 351
       FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 351 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[350] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7018,8 +8006,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 352
       FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 352 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[351] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7032,8 +8023,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 353
       FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 353 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[352] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[65] += amp_sv[0];
       jamp_sv[71] -= amp_sv[0];
@@ -7048,8 +8042,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 354
       VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 354 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[353] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7068,8 +8065,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 355
       VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 355 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[354] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7122,8 +8122,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 357
       FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 357 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[356] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[41] += amp_sv[0];
       jamp_sv[47] -= amp_sv[0];
@@ -7138,8 +8141,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 358
       FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 358 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[357] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7443,8 +8449,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 370
       FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 370 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[369] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7457,8 +8466,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 371
       FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 371 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[370] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7472,8 +8484,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 372
       VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 372 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[371] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7492,8 +8507,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 373
       FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 373 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[372] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[42] += amp_sv[0];
       jamp_sv[44] -= amp_sv[0];
@@ -7508,8 +8526,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 374
       VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 374 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[373] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7528,8 +8549,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 375
       FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 375 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[374] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += amp_sv[0];
       jamp_sv[38] -= amp_sv[0];
@@ -7580,8 +8604,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 377
       FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 377 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[376] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7594,8 +8621,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 378
       FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 378 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[377] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7608,8 +8638,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 379
       FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 379 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[378] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[25] += amp_sv[0];
       jamp_sv[31] -= amp_sv[0];
@@ -7624,8 +8657,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 380
       FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 380 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[379] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7638,8 +8674,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 381
       FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 381 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[380] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7652,8 +8691,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 382
       FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 382 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[381] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += amp_sv[0];
       jamp_sv[30] -= amp_sv[0];
@@ -7668,8 +8710,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 383
       FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 383 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[382] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] += amp_sv[0];
       jamp_sv[25] -= amp_sv[0];
@@ -7684,8 +8729,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 384
       FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 384 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[383] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] += amp_sv[0];
       jamp_sv[41] -= amp_sv[0];
@@ -7700,8 +8748,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 385
       FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 385 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[384] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
@@ -7720,8 +8771,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 386
       FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 386 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[385] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7734,8 +8788,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 387
       FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 387 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[386] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7748,8 +8805,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 388
       VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 388 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[387] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7768,8 +8828,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 389
       FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 389 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[388] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[51] -= amp_sv[0];
@@ -7784,8 +8847,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 390
       VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 390 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[389] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7804,8 +8870,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 391
       FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 391 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[390] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += amp_sv[0];
       jamp_sv[53] -= amp_sv[0];
@@ -7854,8 +8923,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 393
       FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 393 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[392] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7868,8 +8940,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 394
       FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 394 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[393] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7882,8 +8957,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 395
       FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 395 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[394] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += amp_sv[0];
       jamp_sv[85] -= amp_sv[0];
@@ -7898,8 +8976,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 396
       FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 396 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[395] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -7912,8 +8993,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 397
w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 397 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[396] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -7926,8 +9010,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 398 FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 398 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[397] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; @@ -7942,8 +9029,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 399 FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 399 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[398] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] += amp_sv[0]; jamp_sv[94] -= amp_sv[0]; @@ -7958,8 +9048,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 400 FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 400 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[399] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -7974,8 +9067,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 401 FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 401 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[400] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; @@ -7994,8 +9090,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 402 FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 402 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[401] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -8010,8 +9109,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 403 FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 403 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[402] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8030,8 +9132,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 404 FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 
1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 404 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[403] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += amp_sv[0]; jamp_sv[41] -= amp_sv[0]; @@ -8046,8 +9151,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 405 FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 405 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[404] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8066,8 +9174,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 406 FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 406 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[405] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8086,8 +9197,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 407 FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 407 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[406] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8164,8 +9278,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 409 VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 409 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[408] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -8192,8 +9309,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 410 VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 410 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[409] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -8220,8 +9340,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 411 VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 411 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[410] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; @@ -8248,8 +9371,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 412 FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 412 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[411] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8268,8 +9394,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 413 FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 413 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[412] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] += amp_sv[0]; jamp_sv[99] -= amp_sv[0]; @@ -8284,8 +9413,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 414 FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 414 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[413] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[106] += amp_sv[0]; jamp_sv[107] -= amp_sv[0]; @@ -8300,8 +9432,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 415 FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 415 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[414] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8320,8 +9455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 416 FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 416 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[415] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -8336,8 +9474,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 417 FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 417 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[416] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += amp_sv[0]; jamp_sv[38] -= amp_sv[0]; @@ -8352,8 +9493,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 418 FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 418 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[417] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[9] -= amp_sv[0]; @@ -8368,8 +9512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 419 FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 419 
) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[418] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8388,8 +9535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 420 FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 420 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[419] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] += amp_sv[0]; jamp_sv[47] -= amp_sv[0]; @@ -8404,8 +9554,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 421 FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 421 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[420] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8424,8 +9577,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 422 FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 422 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[421] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8444,8 +9600,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 423 FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 423 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[422] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8522,8 +9681,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 425 VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 425 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[424] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -8550,8 +9712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 426 VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 426 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[425] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -8578,8 +9743,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 427 VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId 
== 427 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[426] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= amp_sv[0]; jamp_sv[9] += amp_sv[0]; @@ -8606,8 +9774,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 428 FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 428 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[427] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8626,8 +9797,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 429 FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 429 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[428] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += amp_sv[0]; jamp_sv[75] -= amp_sv[0]; @@ -8642,8 +9816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 430 FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 430 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[429] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] += amp_sv[0]; jamp_sv[83] -= amp_sv[0]; @@ -8658,8 +9835,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 431 FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 431 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[430] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; @@ -8678,8 +9858,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 432 FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 432 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[431] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += amp_sv[0]; jamp_sv[10] -= amp_sv[0]; @@ -8694,8 +9877,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 433 FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 433 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[432] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += amp_sv[0]; jamp_sv[44] -= amp_sv[0]; @@ -8710,8 +9896,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 434 VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 434 ) numerators_sv += cxabs2( amp_sv[0] ); - if( 
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[433] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= amp_sv[0]; jamp_sv[25] += amp_sv[0]; @@ -8738,8 +9927,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 435 VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 435 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[434] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= amp_sv[0]; jamp_sv[24] += amp_sv[0]; @@ -8824,8 +10016,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 437 VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 437 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[436] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -8852,8 +10047,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 438 VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 438 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[437] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -8938,8 +10136,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 440 VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 440 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[439] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -8966,8 +10167,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 441 VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 441 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[440] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -9288,8 +10492,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 447 VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 447 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[446] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -9316,8 +10523,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 448 VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[447] += 
cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -9344,8 +10554,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 449 VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 449 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[448] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= amp_sv[0]; jamp_sv[11] += amp_sv[0]; @@ -9372,8 +10585,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 450 VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 450 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[449] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9392,8 +10608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 451 FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 451 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[450] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] += amp_sv[0]; jamp_sv[92] -= amp_sv[0]; @@ -9408,8 +10627,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 452 FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 452 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[451] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9422,8 +10644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 453 FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 453 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[452] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9436,8 +10661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 454 FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 454 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[453] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += amp_sv[0]; jamp_sv[80] -= amp_sv[0]; @@ -9452,8 +10680,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 455 VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 455 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[454] += cxabs2( amp_sv[0] 
); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; @@ -9506,8 +10737,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 457 FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 457 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[456] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += amp_sv[0]; jamp_sv[78] -= amp_sv[0]; @@ -9522,8 +10756,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 458 FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 458 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[457] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9536,8 +10773,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 459 FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 459 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[458] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9550,8 +10790,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 460 VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 460 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[459] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9570,8 +10813,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 461 FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 461 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[460] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[115] += amp_sv[0]; jamp_sv[116] -= amp_sv[0]; @@ -9586,8 +10832,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 462 FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 462 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[461] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9600,8 +10849,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 463 FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 463 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( 
storeChannelWeights ) + { + numerators_sv[462] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9614,8 +10866,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 464 FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 464 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[463] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += amp_sv[0]; jamp_sv[104] -= amp_sv[0]; @@ -9630,8 +10885,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 465 VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 465 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[464] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; @@ -9684,8 +10942,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 467 FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 467 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[466] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] += amp_sv[0]; jamp_sv[102] -= amp_sv[0]; @@ -9700,8 +10961,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 468 FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 468 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[467] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9714,8 +10978,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 469 FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 469 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[468] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9728,8 +10995,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 470 VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 470 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[469] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9748,8 +11018,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 471 FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 471 ) numerators_sv += cxabs2( amp_sv[0] ); - 
if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[470] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -9764,8 +11037,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 472 FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 472 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[471] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9778,8 +11054,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 473 FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 473 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[472] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9792,8 +11071,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 474 FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 474 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[473] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[52] -= amp_sv[0]; @@ -9808,8 +11090,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 475 VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 475 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[474] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; @@ -9862,8 +11147,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 477 VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 477 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[476] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9882,8 +11170,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 478 FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 478 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[477] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[24] -= amp_sv[0]; @@ -9898,8 +11189,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 479 FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 479 ) numerators_sv += 
cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[478] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9912,8 +11206,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 480 FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 480 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[479] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -9926,8 +11223,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 481 FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 481 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[480] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[50] -= amp_sv[0]; @@ -9942,8 +11242,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 482 VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 482 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[481] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; @@ -9996,8 +11299,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 484 FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 484 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[483] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10016,8 +11322,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 485 FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 485 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[484] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10036,8 +11345,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 486 FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 486 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[485] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10056,8 +11368,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 487 FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 487 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[486] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -10072,8 +11387,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 488 FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 488 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[487] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10092,8 +11410,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 489 FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 489 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[488] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] += amp_sv[0]; jamp_sv[93] -= amp_sv[0]; @@ -10235,8 +11556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 493 FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 493 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[492] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10249,8 +11573,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 494 FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 494 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[493] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10263,8 +11590,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 495 VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 495 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[494] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10283,8 +11613,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 496 FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 496 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[495] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] += amp_sv[0]; jamp_sv[44] -= amp_sv[0]; @@ -10299,8 +11632,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 497 VVV1_0( w_fp[104], 
w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 497 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[496] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10319,8 +11655,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 498 FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 498 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[497] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += amp_sv[0]; jamp_sv[32] -= amp_sv[0]; @@ -10371,8 +11710,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 500 FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 500 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[499] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10385,8 +11727,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 501 FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 501 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[500] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10399,8 +11744,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 502 FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 502 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[501] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -10415,8 +11763,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 503 FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 503 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[502] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10429,8 +11780,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 504 FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 504 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[503] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10443,8 +11797,11 @@ namespace 
mg5amcCpu // Amplitude(s) for diagram number 505 FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 505 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[504] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += amp_sv[0]; jamp_sv[30] -= amp_sv[0]; @@ -10459,8 +11816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 506 FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 506 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[505] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += amp_sv[0]; jamp_sv[27] -= amp_sv[0]; @@ -10475,8 +11835,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 507 FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 507 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[506] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] += amp_sv[0]; jamp_sv[35] -= amp_sv[0]; @@ -10491,8 +11854,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 508 FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 508 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[507] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10511,8 +11877,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 509 FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 509 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[508] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10525,8 +11894,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 510 FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 510 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[509] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10539,8 +11911,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 511 VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 511 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[510] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; @@ 
-10559,8 +11934,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 512 FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 512 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[511] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += amp_sv[0]; jamp_sv[51] -= amp_sv[0]; @@ -10575,8 +11953,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 513 VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 513 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[512] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10595,8 +11976,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 514 FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 514 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[513] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += amp_sv[0]; jamp_sv[77] -= amp_sv[0]; @@ -10645,8 +12029,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 516 FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 516 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[515] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10659,8 +12046,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 517 FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 517 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[516] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10673,8 +12063,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 518 FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 518 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[517] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; @@ -10689,8 +12082,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 519 FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 519 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[518] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[112] 
-= cxtype( 0, 1 ) * amp_sv[0]; @@ -10703,8 +12099,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 520 FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 520 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[519] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -10717,8 +12116,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 521 FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 521 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[520] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[101] += amp_sv[0]; jamp_sv[109] -= amp_sv[0]; @@ -10733,8 +12135,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 522 FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 522 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[521] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[67] += amp_sv[0]; jamp_sv[70] -= amp_sv[0]; @@ -10749,8 +12154,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 523 FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 523 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[522] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; @@ -10765,8 +12173,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 524 FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 524 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[523] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; @@ -10785,8 +12196,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 525 FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 525 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[524] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; @@ -10801,8 +12215,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 526 FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 526 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[525] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; 
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10821,8 +12238,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 527
       FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 527 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[526] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] += amp_sv[0];
       jamp_sv[35] -= amp_sv[0];
@@ -10837,8 +12257,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 528
       FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 528 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[527] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10857,8 +12280,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 529
       FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 529 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[528] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10877,8 +12303,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 530
       FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 530 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[529] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
@@ -10955,8 +12384,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 532
       VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 532 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[531] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -10983,8 +12415,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 533
       VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 533 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[532] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -11011,8 +12446,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 534
       VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 534 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[533] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= amp_sv[0];
       jamp_sv[17] += amp_sv[0];
@@ -11039,8 +12477,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 535
       FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 535 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[534] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11059,8 +12500,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 536
       FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 536 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[535] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += amp_sv[0];
       jamp_sv[101] -= amp_sv[0];
@@ -11075,8 +12519,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 537
       FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 537 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[536] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[104] += amp_sv[0];
       jamp_sv[105] -= amp_sv[0];
@@ -11091,8 +12538,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 538
       FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 538 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[537] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11111,8 +12561,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 539
       FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 539 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[538] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -11127,8 +12580,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 540
       FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 540 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[539] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] += amp_sv[0];
       jamp_sv[32] -= amp_sv[0];
@@ -11143,8 +12599,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 541
       FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 541 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[540] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -11159,8 +12618,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 542
       FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 542 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[541] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11179,8 +12641,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 543
       FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 543 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[542] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[44] += amp_sv[0];
       jamp_sv[45] -= amp_sv[0];
@@ -11195,8 +12660,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 544
       FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 544 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[543] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11215,8 +12683,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 545
       FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 545 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[544] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11235,8 +12706,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 546
       FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 546 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[545] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11313,8 +12787,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 548
       VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 548 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[547] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[16] -= amp_sv[0];
@@ -11341,8 +12818,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 549
       VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 549 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[548] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -11369,8 +12849,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 550
       VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 550 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[549] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= amp_sv[0];
       jamp_sv[15] += amp_sv[0];
@@ -11397,8 +12880,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 551
       FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 551 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[550] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11417,8 +12903,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 552
       FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 552 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[551] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] += amp_sv[0];
       jamp_sv[51] -= amp_sv[0];
@@ -11433,8 +12922,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 553
       FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 553 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[552] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[58] += amp_sv[0];
       jamp_sv[59] -= amp_sv[0];
@@ -11449,8 +12941,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 554
       FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 554 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[553] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
@@ -11469,8 +12964,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 555
       FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 555 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[554] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[16] -= amp_sv[0];
@@ -11485,8 +12983,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 556
       FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 556 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[555] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[43] += amp_sv[0];
       jamp_sv[46] -= amp_sv[0];
@@ -11501,8 +13002,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 557
       VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 557 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[556] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= amp_sv[0];
       jamp_sv[27] += amp_sv[0];
@@ -11529,8 +13033,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 558
       VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 558 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[557] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= amp_sv[0];
       jamp_sv[26] += amp_sv[0];
@@ -11615,8 +13122,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 560
       VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 560 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[559] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] += amp_sv[0];
       jamp_sv[26] -= amp_sv[0];
@@ -11643,8 +13153,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 561
       VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 561 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[560] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += amp_sv[0];
       jamp_sv[26] -= amp_sv[0];
@@ -11729,8 +13242,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 563
       VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 563 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[562] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] += amp_sv[0];
       jamp_sv[27] -= amp_sv[0];
@@ -11757,8 +13273,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 564
       VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 564 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[563] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += amp_sv[0];
       jamp_sv[27] -= amp_sv[0];
@@ -12079,8 +13598,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 570
       VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 570 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[569] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -12107,8 +13629,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 571
       VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 571 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[570] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -12135,8 +13660,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 572
       VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 572 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[571] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[15] -= amp_sv[0];
       jamp_sv[17] += amp_sv[0];
@@ -12163,8 +13691,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 573
       VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 573 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[572] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12183,8 +13714,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 574
       FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 574 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[573] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[67] += amp_sv[0];
       jamp_sv[68] -= amp_sv[0];
@@ -12199,8 +13733,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 575
       FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 575 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[574] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12213,8 +13750,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 576
       FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 576 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[575] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12227,8 +13767,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 577
       FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 577 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[576] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[54] += amp_sv[0];
       jamp_sv[56] -= amp_sv[0];
@@ -12243,8 +13786,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 578
       VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 578 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[577] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12297,8 +13843,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 580
       FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 580 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[579] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] += amp_sv[0];
       jamp_sv[54] -= amp_sv[0];
@@ -12313,8 +13862,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 581
       FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 581 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[580] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12327,8 +13879,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 582
       FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 582 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[581] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12341,8 +13896,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 583
       VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 583 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[582] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12361,8 +13919,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 584
       FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 584 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[583] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[109] += amp_sv[0];
       jamp_sv[110] -= amp_sv[0];
@@ -12377,8 +13938,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 585
       FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[584] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12391,8 +13955,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 586
       FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 586 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[585] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12405,8 +13972,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 587
       FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 587 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[586] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[103] += amp_sv[0];
       jamp_sv[104] -= amp_sv[0];
@@ -12421,8 +13991,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 588
       VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 588 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[587] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12475,8 +14048,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 590
       FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 590 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[589] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += amp_sv[0];
       jamp_sv[103] -= amp_sv[0];
@@ -12491,8 +14067,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 591
       FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 591 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[590] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12505,8 +14084,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 592
       FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 592 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[591] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12519,8 +14101,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 593
       VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 593 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[592] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12539,8 +14124,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 594
       FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 594 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[593] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[27] -= amp_sv[0];
@@ -12555,8 +14143,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 595
       FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 595 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[594] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12569,8 +14160,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 596
       FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 596 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[595] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12583,8 +14177,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 597
       FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 597 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[596] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += amp_sv[0];
       jamp_sv[76] -= amp_sv[0];
@@ -12599,8 +14196,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 598
       VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 598 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[597] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12653,8 +14253,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 600
       VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 600 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[599] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12673,8 +14276,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 601
       FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 601 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[600] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[26] -= amp_sv[0];
@@ -12689,8 +14295,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 602
       FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 602 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[601] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12703,8 +14312,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 603
       FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 603 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[602] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -12717,8 +14329,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 604
       FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 604 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[603] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += amp_sv[0];
       jamp_sv[50] -= amp_sv[0];
@@ -12733,8 +14348,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 605
       VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 605 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[604] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12787,8 +14405,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 607
       FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 607 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[606] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12807,8 +14428,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 608
       FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[607] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12827,8 +14451,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 609
       FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 609 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[608] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12847,8 +14474,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 610
       FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 610 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[609] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -12863,8 +14493,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 611
       FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 611 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[610] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
@@ -12883,8 +14516,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 612
       FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[611] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[68] += amp_sv[0];
       jamp_sv[69] -= amp_sv[0];
@@ -13026,8 +14662,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 616
       FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 616 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[615] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13040,8 +14679,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 617
       FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 617 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[616] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13054,8 +14696,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 618
       VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 618 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[617] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13074,8 +14719,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 619
       FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 619 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[618] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] += amp_sv[0];
       jamp_sv[38] -= amp_sv[0];
@@ -13090,8 +14738,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 620
       VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[619] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13110,8 +14761,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 621
       FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 621 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[620] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] += amp_sv[0];
       jamp_sv[32] -= amp_sv[0];
@@ -13162,8 +14816,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 623
       FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 623 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[622] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13176,8 +14833,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 624
       FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 624 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[623] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13190,8 +14850,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 625
       FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 625 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[624] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[29] += amp_sv[0];
       jamp_sv[37] -= amp_sv[0];
@@ -13206,8 +14869,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 626
       FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 626 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[625] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13220,8 +14886,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 627
       FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 627 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[626] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13234,8 +14903,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 628
       FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 628 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[627] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] += amp_sv[0];
       jamp_sv[31] -= amp_sv[0];
@@ -13250,8 +14922,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 629
       FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 629 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[628] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -13266,8 +14941,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 630
       FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 630 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[629] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += amp_sv[0];
       jamp_sv[33] -= amp_sv[0];
@@ -13282,8 +14960,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 631
       FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 631 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[630] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13302,8 +14983,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 632
       FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 632 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[631] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13316,8 +15000,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 633
       FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 633 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[632] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13330,8 +15017,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 634
       VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 634 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[633] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13350,8 +15040,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 635
       FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 635 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[634] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += amp_sv[0];
       jamp_sv[53] -= amp_sv[0];
@@ -13366,8 +15059,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 636
       VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 636 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[635] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13386,8 +15082,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 637
       FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 637 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[636] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] += amp_sv[0];
       jamp_sv[77] -= amp_sv[0];
@@ -13436,8 +15135,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 639
       FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 639 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[638] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13450,8 +15152,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 640
       FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 640 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[639] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13464,8 +15169,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 641
       FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 641 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[640] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[53] += amp_sv[0];
       jamp_sv[61] -= amp_sv[0];
@@ -13480,8 +15188,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 642
       FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 642 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[641] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13494,8 +15205,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 643
       FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 643 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[642] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -13508,8 +15222,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 644
       FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 644 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[643] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[77] += amp_sv[0];
       jamp_sv[85] -= amp_sv[0];
@@ -13524,8 +15241,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 645
       FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 645 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[644] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] += amp_sv[0];
       jamp_sv[64] -= amp_sv[0];
@@ -13540,8 +15260,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 646
       FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 646 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[645] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += amp_sv[0];
       jamp_sv[23] -= amp_sv[0];
@@ -13556,8 +15279,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 647
       FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 647 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[646] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13576,8 +15302,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 648
       FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 648 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[647] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] += amp_sv[0];
       jamp_sv[23] -= amp_sv[0];
@@ -13592,8 +15321,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 649
       FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 649 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[648] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13612,8 +15344,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 650
       FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 650 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[649] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += amp_sv[0];
       jamp_sv[33] -= amp_sv[0];
@@ -13628,8 +15363,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 651
       FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 651 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[650] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13648,8 +15386,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 652
       FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 652 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[651] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13668,8 +15409,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 653
       FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 653 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[652] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13746,8 +15490,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 655
       VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 655 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[654] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -13774,8 +15521,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 656
       VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 656 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[655] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -13802,8 +15552,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 657
       VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 657 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[656] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= amp_sv[0];
       jamp_sv[23] += amp_sv[0];
@@ -13830,8 +15583,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 658
       FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 658 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[657] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13850,8 +15606,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 659
       FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 659 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[658] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[76] += amp_sv[0];
       jamp_sv[77] -= amp_sv[0];
@@ -13866,8 +15625,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 660
       FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 660 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[659] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[80] += amp_sv[0];
       jamp_sv[81] -= amp_sv[0];
@@ -13882,8 +15644,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 661
       FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 661 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[660] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13902,8 +15667,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 662
       FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 662 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[661] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -13918,8 +15686,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 663
       FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 663 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[662] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] += amp_sv[0];
       jamp_sv[34] -= amp_sv[0];
@@ -13934,8 +15705,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 664
       FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 664 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[663] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += amp_sv[0];
       jamp_sv[21] -= amp_sv[0];
@@ -13950,8 +15724,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 665
       FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 665 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[664] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
@@ -13970,8 +15747,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 666
       FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 666 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[665] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] += amp_sv[0];
       jamp_sv[39] -= amp_sv[0];
@@ -13986,8 +15766,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 667
       FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 667 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[666] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14006,8 +15789,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 668
       FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 668 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[667] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14026,8 +15812,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 669
       FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 669 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[668] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14104,8 +15893,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 671
       VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 671 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[670] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[22] -= amp_sv[0];
@@ -14132,8 +15924,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 672
       VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 672 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[671] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -14160,8 +15955,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 673
       VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 673 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[672] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= amp_sv[0];
       jamp_sv[21] += amp_sv[0];
@@ -14188,8 +15986,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 674
       FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 674 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[673] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14208,8 +16009,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 675
       FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 675 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[674] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[52] += amp_sv[0];
       jamp_sv[53] -= amp_sv[0];
@@ -14224,8 +16028,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 676
       FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 676 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[675] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[56] += amp_sv[0];
       jamp_sv[57] -= amp_sv[0];
@@ -14240,8 +16047,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 677
       FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 677 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[676] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
@@ -14260,8 +16070,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 678
       FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 678 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[677] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[22] -= amp_sv[0];
@@ -14276,8 +16089,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 679
       FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 679 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[678] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] += amp_sv[0];
       jamp_sv[40] -= amp_sv[0];
@@ -14292,8 +16108,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 680
       VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 680 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[679] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= amp_sv[0];
       jamp_sv[29] += amp_sv[0];
@@ -14320,8 +16139,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 681
       VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 681 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[680] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= amp_sv[0];
       jamp_sv[28] += amp_sv[0];
@@ -14406,8 +16228,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 683
       VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 683 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[682] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[21] += amp_sv[0];
       jamp_sv[28] -= amp_sv[0];
@@ -14434,8 +16259,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 684
       VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 684 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[683] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += amp_sv[0];
       jamp_sv[28] -= amp_sv[0];
@@ -14520,8 +16348,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 686
       VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 686 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[685] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[23] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -14548,8 +16379,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 687
       VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 687 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[686] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] += amp_sv[0];
       jamp_sv[29] -= amp_sv[0];
@@ -14870,8 +16704,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 693
       VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 693 ) numerators_sv += cxabs2( amp_sv[0] );
-      if(
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[692] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -14898,8 +16735,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 694 VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 694 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[693] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -14926,8 +16766,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 695 VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 695 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[694] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] -= amp_sv[0]; jamp_sv[23] += amp_sv[0]; @@ -14954,8 +16797,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 696 VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 696 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[695] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -14974,8 +16820,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 697 FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 697 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[696] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[61] += amp_sv[0]; jamp_sv[62] -= amp_sv[0]; @@ -14990,8 +16839,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 698 FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 698 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[697] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15004,8 +16856,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 699 FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 699 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[698] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15018,8 +16873,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 700 FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 700 ) numerators_sv += cxabs2( amp_sv[0] ); - if( 
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[699] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[55] += amp_sv[0]; jamp_sv[56] -= amp_sv[0]; @@ -15034,8 +16892,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 701 VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 701 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[700] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15088,8 +16949,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 703 FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 703 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[702] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] += amp_sv[0]; jamp_sv[55] -= amp_sv[0]; @@ -15104,8 +16968,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 704 FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 704 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[703] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15118,8 +16985,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 705 FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 705 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[704] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15132,8 +17002,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 706 VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 706 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[705] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15152,8 +17025,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 707 FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 707 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[706] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[85] += amp_sv[0]; jamp_sv[86] -= amp_sv[0]; @@ -15168,8 +17044,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 708 FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 708 ) numerators_sv 
+= cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[707] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15182,8 +17061,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 709 FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 709 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[708] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15196,8 +17078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 710 FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 710 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[709] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[79] += amp_sv[0]; jamp_sv[80] -= amp_sv[0]; @@ -15212,8 +17097,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 711 VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 711 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[710] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15266,8 +17154,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 713 FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 713 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[712] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] += amp_sv[0]; jamp_sv[79] -= amp_sv[0]; @@ -15282,8 +17173,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 714 FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 714 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[713] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15296,8 +17190,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 715 FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 715 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[714] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15310,8 +17207,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 716 VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 716 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[715] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15330,8 +17230,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 717 FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 717 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[716] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; @@ -15346,8 +17249,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 718 FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 718 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[717] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15360,8 +17266,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 719 FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 719 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[718] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15374,8 +17283,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 720 FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 720 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[719] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += amp_sv[0]; jamp_sv[76] -= amp_sv[0]; @@ -15390,8 +17302,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 721 VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 721 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[720] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15444,8 +17359,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 723 VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 723 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[722] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15464,8 +17382,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 724 FFV1_0( 
w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 724 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[723] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[28] -= amp_sv[0]; @@ -15480,8 +17401,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 725 FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 725 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[724] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15494,8 +17418,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 726 FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 726 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[725] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15508,8 +17435,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 727 FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 727 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[726] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] += amp_sv[0]; jamp_sv[52] -= amp_sv[0]; @@ -15524,8 +17454,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 728 VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 728 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[727] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15578,8 +17511,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 730 FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 730 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[729] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15598,8 +17534,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 731 FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 731 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[730] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15618,8 +17557,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 732 FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 732 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[731] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15638,8 +17580,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 733 FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 733 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[732] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -15654,8 +17599,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 734 FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 734 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[733] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; @@ -15674,8 +17622,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 735 FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 735 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[734] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] += amp_sv[0]; jamp_sv[63] -= amp_sv[0]; @@ -15816,8 +17767,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 739 FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 739 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[738] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[29] -= amp_sv[0]; @@ -15829,8 +17783,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 740 FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 740 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[739] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] -= amp_sv[0]; @@ -15842,8 +17799,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 741 FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 741 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[740] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] -= amp_sv[0]; @@ -15855,8 +17815,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 742 FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 742 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[741] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] -= amp_sv[0]; @@ -15868,8 +17831,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 743 FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 743 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[742] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[37] -= amp_sv[0]; @@ -15881,8 +17847,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 744 FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 744 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[743] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] -= amp_sv[0]; @@ -15894,8 +17863,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 745 FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 745 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[744] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15908,8 +17880,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 746 FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 746 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[745] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -15922,8 +17897,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 747 FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 747 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[746] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += amp_sv[0]; jamp_sv[29] -= amp_sv[0]; @@ -15938,8 +17916,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 748 FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 748 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[747] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[28] -= amp_sv[0]; @@ -15951,8 +17932,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 749 FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 749 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += 
cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[748] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] -= amp_sv[0]; @@ -15964,8 +17948,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 750 FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 750 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[749] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] -= amp_sv[0]; @@ -15977,8 +17964,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 751 FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 751 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[750] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] -= amp_sv[0]; @@ -15990,8 +17980,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 752 FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 752 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[751] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[31] -= amp_sv[0]; @@ -16003,8 +17996,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 753 FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 753 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[752] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] -= amp_sv[0]; @@ -16016,8 +18012,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 754 FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 754 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[753] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16030,8 +18029,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 755 FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 755 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[754] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16044,8 +18046,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 756 FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 756 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[755] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += 
amp_sv[0]; jamp_sv[28] -= amp_sv[0]; @@ -16060,8 +18065,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 757 FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 757 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[756] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] -= amp_sv[0]; @@ -16073,8 +18081,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 758 FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 758 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[757] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] -= amp_sv[0]; @@ -16086,8 +18097,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 759 FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 759 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[758] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] -= amp_sv[0]; @@ -16099,8 +18113,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 760 FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 760 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[759] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[38] -= amp_sv[0]; @@ -16112,8 +18129,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 761 FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 761 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[760] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] -= amp_sv[0]; @@ -16125,8 +18145,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 762 FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 762 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[761] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] -= amp_sv[0]; @@ -16138,8 +18161,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 763 FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 763 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[762] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16152,8 +18178,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 764 FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( 
channelId == 764 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[763] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16166,8 +18195,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 765 FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 765 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[764] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -16182,8 +18214,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 766 FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 766 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[765] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16196,8 +18231,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 767 FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 767 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[766] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -16212,8 +18250,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 768 VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 768 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[767] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16232,8 +18273,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 769 FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 769 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[768] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -16248,8 +18292,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 770 VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 770 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[769] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16268,8 +18315,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 771 FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 771 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[770] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16318,8 +18368,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 773 FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 773 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[772] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16332,8 +18385,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 774 FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 774 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[773] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += amp_sv[0]; jamp_sv[26] -= amp_sv[0]; @@ -16348,8 +18404,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 775 VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 775 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[774] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16368,8 +18427,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 776 FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 776 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[775] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += amp_sv[0]; jamp_sv[37] -= amp_sv[0]; @@ -16384,8 +18446,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 777 VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 777 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[776] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16404,8 +18469,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 778 FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 778 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[777] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16454,8 +18522,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 780 FFV1_0( 
w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 780 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[779] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16468,8 +18539,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 781 FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 781 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[780] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += amp_sv[0]; jamp_sv[25] -= amp_sv[0]; @@ -16484,8 +18558,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 782 VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 782 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[781] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16504,8 +18581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 783 FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 783 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[782] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += amp_sv[0]; jamp_sv[31] -= amp_sv[0]; @@ -16520,8 +18600,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 784 VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 784 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[783] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16540,8 +18623,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 785 FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 785 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[784] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16648,8 +18734,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 789 FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 789 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[788] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] -= amp_sv[0]; @@ -16661,8 +18750,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 790 FFV1_0( 
w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 790 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[789] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[70] -= amp_sv[0]; @@ -16674,8 +18766,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 791 FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 791 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[790] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[53] -= amp_sv[0]; @@ -16687,8 +18782,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 792 FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 792 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[791] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] -= amp_sv[0]; @@ -16700,8 +18798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 793 FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 793 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[792] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[67] -= amp_sv[0]; @@ -16713,8 +18814,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 794 FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 794 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[793] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[61] -= amp_sv[0]; @@ -16726,8 +18830,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 795 FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 795 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[794] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16740,8 +18847,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 796 FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 796 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[795] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -16754,8 +18864,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 797 FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 797 ) numerators_sv += cxabs2( amp_sv[0] ); - if( 
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[796] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] += amp_sv[0]; jamp_sv[53] -= amp_sv[0]; @@ -16770,8 +18883,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 798 FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 798 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[797] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] -= amp_sv[0]; @@ -16783,8 +18899,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 799 FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 799 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[798] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[94] -= amp_sv[0]; @@ -16796,8 +18915,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 800 FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 800 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[799] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[77] -= amp_sv[0]; @@ -16809,8 +18931,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 801 FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 801 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[800] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[75] -= amp_sv[0]; @@ -16822,8 +18947,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 802 FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 802 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[801] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[91] -= amp_sv[0]; @@ -16835,8 +18963,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 803 FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 803 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[802] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[85] -= amp_sv[0]; @@ -16848,8 +18979,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 804 FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 804 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[803] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[94] -= cxtype( 0, 1 ) * 
amp_sv[0];
@@ -16862,8 +18996,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 805
       FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 805 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[804] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16876,8 +19013,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 806
       FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 806 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[805] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[75] += amp_sv[0];
       jamp_sv[77] -= amp_sv[0];
@@ -16892,8 +19032,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 807
       FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 807 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[806] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[112] -= amp_sv[0];
@@ -16905,8 +19048,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 808
       FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 808 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[807] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[118] -= amp_sv[0];
@@ -16918,8 +19064,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 809
       FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 809 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[808] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[101] -= amp_sv[0];
@@ -16931,8 +19080,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 810
       FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 810 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[809] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[99] -= amp_sv[0];
@@ -16944,8 +19096,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 811
       FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 811 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[810] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[115] -= amp_sv[0];
@@ -16957,8 +19112,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 812
       FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 812 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[811] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[109] -= amp_sv[0];
@@ -16970,8 +19128,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 813
       FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 813 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[812] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16984,8 +19145,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 814
       FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 814 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[813] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -16998,8 +19162,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 815
       FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 815 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[814] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[99] += amp_sv[0];
       jamp_sv[101] -= amp_sv[0];
@@ -17014,8 +19181,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 816
       FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 816 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[815] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17028,8 +19198,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 817
       FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 817 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[816] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] += amp_sv[0];
       jamp_sv[88] -= amp_sv[0];
@@ -17044,8 +19217,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 818
       VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 818 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[817] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17064,8 +19240,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 819
       FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 819 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[818] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += amp_sv[0];
       jamp_sv[15] -= amp_sv[0];
@@ -17080,8 +19259,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 820
       VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 820 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[819] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17100,8 +19282,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 821
       FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 821 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[820] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17148,8 +19333,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 823
       FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 823 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[822] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17162,8 +19350,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 824
       FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 824 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[823] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[70] += amp_sv[0];
       jamp_sv[88] -= amp_sv[0];
@@ -17178,8 +19369,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 825
       VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 825 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[824] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17198,8 +19392,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 826
       FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 826 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[825] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += amp_sv[0];
       jamp_sv[21] -= amp_sv[0];
@@ -17214,8 +19411,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 827
       VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 827 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[826] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17234,8 +19434,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 828
       FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 828 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[827] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17282,8 +19485,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 830
       FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 830 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[829] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17296,8 +19502,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 831
       FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 831 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[830] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[64] += amp_sv[0];
       jamp_sv[70] -= amp_sv[0];
@@ -17312,8 +19521,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 832
       VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 832 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[831] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17332,8 +19544,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 833
       FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 833 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[832] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[17] += amp_sv[0];
       jamp_sv[23] -= amp_sv[0];
@@ -17348,8 +19563,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 834
       VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 834 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[833] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17368,8 +19586,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 835
       FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 835 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[834] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -17472,8 +19693,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 839
       VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 839 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[838] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= amp_sv[0];
       jamp_sv[7] += amp_sv[0];
@@ -17500,8 +19724,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 840
       VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 840 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[839] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= amp_sv[0];
       jamp_sv[6] += amp_sv[0];
@@ -17586,8 +19813,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 842
       VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 842 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[841] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -17614,8 +19844,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 843
       VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 843 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[842] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -17700,8 +19933,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 845
       VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 845 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[844] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -17728,8 +19964,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 846
       VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 846 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[845] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -18054,8 +20293,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 852
       VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 852 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[851] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
@@ -18082,8 +20324,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 853
       VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 853 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[852] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[1] -= amp_sv[0];
@@ -18110,8 +20355,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 854
       VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 854 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[853] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= amp_sv[0];
       jamp_sv[17] += amp_sv[0];
@@ -18138,8 +20386,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 855
       VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 855 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[854] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18158,8 +20409,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 856
       FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 856 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[855] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] += amp_sv[0];
       jamp_sv[91] -= amp_sv[0];
@@ -18174,8 +20428,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 857
       FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 857 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[856] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18188,8 +20445,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 858
       FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 858 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[857] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] += amp_sv[0];
       jamp_sv[74] -= amp_sv[0];
@@ -18204,8 +20464,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 859
       FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 859 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[858] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18218,8 +20481,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 860
       VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 860 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[859] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18272,8 +20538,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 862
       FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 862 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[861] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] += amp_sv[0];
       jamp_sv[74] -= amp_sv[0];
@@ -18288,8 +20557,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 863
       FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 863 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[862] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18302,8 +20574,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 864
       FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 864 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[863] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18316,8 +20591,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 865
       VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 865 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[864] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18336,8 +20614,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 866
       FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 866 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[865] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[114] += amp_sv[0];
       jamp_sv[115] -= amp_sv[0];
@@ -18352,8 +20633,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 867
       FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 867 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[866] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18366,8 +20650,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 868
       FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 868 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[867] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] += amp_sv[0];
       jamp_sv[98] -= amp_sv[0];
@@ -18382,8 +20669,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 869
       FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 869 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[868] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18396,8 +20686,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 870
       VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 870 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[869] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18450,8 +20743,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 872
       FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 872 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[871] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] += amp_sv[0];
       jamp_sv[98] -= amp_sv[0];
@@ -18466,8 +20762,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 873
       FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 873 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[872] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18480,8 +20779,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 874
       FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 874 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[873] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18494,8 +20796,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 875
       VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 875 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[874] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18514,8 +20819,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 876
       FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 876 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[875] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -18530,8 +20838,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 877
       FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 877 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[876] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18544,8 +20855,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 878
       FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 878 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[877] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] += amp_sv[0];
       jamp_sv[58] -= amp_sv[0];
@@ -18560,8 +20874,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 879
       FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 879 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[878] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18574,8 +20891,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 880
       VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 880 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[879] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18628,8 +20948,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 882
       VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 882 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[881] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18648,8 +20971,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 883
       FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 883 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[882] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -18664,8 +20990,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 884
       FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 884 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[883] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18678,8 +21007,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 885
       FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 885 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[884] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[32] += amp_sv[0];
       jamp_sv[56] -= amp_sv[0];
@@ -18694,8 +21026,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 886
       FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 886 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[885] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -18708,8 +21043,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 887
       VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 887 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[886] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18762,8 +21100,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 889
       FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 889 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[888] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18782,8 +21123,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 890
       FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 890 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[889] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18802,8 +21146,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 891
       FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 891 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[890] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18822,8 +21169,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 892
       FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 892 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[891] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
@@ -18842,8 +21192,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 893
       FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 893 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[892] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[30] += amp_sv[0];
       jamp_sv[31] -= amp_sv[0];
@@ -18858,8 +21211,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 894
       FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 894 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[893] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] += amp_sv[0];
       jamp_sv[91] -= amp_sv[0];
@@ -18874,8 +21230,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 895
       VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 895 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[894] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= amp_sv[0];
       jamp_sv[13] += amp_sv[0];
@@ -18902,8 +21261,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 896
       VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 896 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[895] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= amp_sv[0];
       jamp_sv[12] += amp_sv[0];
@@ -18988,8 +21350,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 898
       VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 898 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[897] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
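
Every hunk in this region applies the same mechanical transformation: the two per-channel scalar updates are replaced by an unconditional per-diagram store. numerators_sv is now an array holding one running sum per diagram (indexed by diagramId - 1, hence numerators_sv[804] for diagram 805), while denominators_sv remains a single running sum over all contributing diagrams. Below is a minimal standalone sketch of the two accumulation schemes, using plain std::complex doubles and hypothetical helper names in place of the generated SIMD vector types (cxtype_sv, fptype_sv); it is an illustration of the pattern, not the generated code itself.

#include <cassert>
#include <complex>
#include <cstddef>
#include <vector>

// Sketch only: contrasts the old one-channel accumulation with the new
// per-diagram accumulation used for channelId arrays. Names and scalar
// types are illustrative assumptions.
using cxtype = std::complex<double>;
static double cxabs2( const cxtype& c ) { return std::norm( c ); } // |c|^2

// Old scheme: keep only the numerator of the one requested channel.
void accumulateOld( unsigned int channelId, const std::vector<cxtype>& amps,
                    double& numerator, double& denominator )
{
  for( std::size_t idiag = 0; idiag < amps.size(); idiag++ )
  {
    if( channelId == idiag + 1 ) numerator += cxabs2( amps[idiag] ); // diagram numbers are 1-based
    if( channelId != 0 ) denominator += cxabs2( amps[idiag] );
  }
}

// New scheme: keep one numerator per diagram, so the same event batch can
// later be reweighted for any channel choice.
void accumulateNew( bool storeChannelWeights, const std::vector<cxtype>& amps,
                    std::vector<double>& numerators, double& denominator )
{
  if( !storeChannelWeights ) return;
  assert( numerators.size() == amps.size() ); // one slot per diagram
  for( std::size_t idiag = 0; idiag < amps.size(); idiag++ )
  {
    numerators[idiag] += cxabs2( amps[idiag] ); // index idiag == diagramId - 1
    denominator += cxabs2( amps[idiag] );
  }
}

With the per-diagram sums stored, the single-diagram enhancement weight for any channel can be formed after the diagram loop, in effect as numerators[channelId - 1] / denominator, instead of being fixed to one channelId while the amplitudes are being accumulated.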
@@ -19016,8 +21381,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 899
       VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 899 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[898] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -19102,8 +21470,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 901
       VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 901 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[900] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -19130,8 +21501,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 902
       VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 902 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[901] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -19454,8 +21828,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 908
       VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 908 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[907] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
@@ -19482,8 +21859,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 909
       VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 909 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[908] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
@@ -19510,8 +21890,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 910
       VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 910 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[909] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= amp_sv[0];
       jamp_sv[11] += amp_sv[0];
@@ -19538,8 +21921,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 911
       VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 911 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[910] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19558,8 +21944,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 912
       FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 912 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[911] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] += amp_sv[0];
       jamp_sv[67] -= amp_sv[0];
@@ -19574,8 +21963,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 913
       FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 913 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[912] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19588,8 +21980,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 914
       FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 914 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[913] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] += amp_sv[0];
       jamp_sv[50] -= amp_sv[0];
@@ -19604,8 +21999,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 915
       FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 915 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[914] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19618,8 +22016,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 916
       VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 916 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[915] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
@@ -19672,8 +22073,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 918
       FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 918 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[917] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] += amp_sv[0];
       jamp_sv[50] -= amp_sv[0];
@@ -19688,8 +22092,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 919
       FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 919 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[918] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19702,8 +22109,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 920
       FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 920 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[919] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19716,8 +22126,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 921
       VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 921 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[920] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19736,8 +22149,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 922
       FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 922 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[921] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[108] += amp_sv[0];
       jamp_sv[109] -= amp_sv[0];
@@ -19752,8 +22168,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 923
       FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 923 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[922] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19766,8 +22185,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 924
       FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 924 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[923] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] += amp_sv[0];
       jamp_sv[98] -= amp_sv[0];
@@ -19782,8 +22204,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 925
       FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 925 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[924] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19796,8 +22221,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 926
       VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 926 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[925] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
@@ -19850,8 +22278,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 928
       FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 928 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[927] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] += amp_sv[0];
       jamp_sv[100] -= amp_sv[0];
@@ -19866,8 +22297,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 929
       FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 929 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[928] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19880,8 +22314,11 @@ namespace mg5amcCpu
      // Amplitude(s) for diagram number 930
       FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 930 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[929] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19894,8 +22331,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 931
       VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 931 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[930] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19914,8 +22354,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 932
       FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 932 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[931] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -19930,8 +22373,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 933
       FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 933 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[932] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19944,8 +22390,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 934
       FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 934 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[933] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[40] += amp_sv[0];
       jamp_sv[82] -= amp_sv[0];
@@ -19960,8 +22409,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 935
       FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 935 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[934] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -19974,8 +22426,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 936
       VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 936 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[935] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20028,8 +22483,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 938
       VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 938 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[937] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20048,8 +22506,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 939
       FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 939 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[938] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[12] -= amp_sv[0];
@@ -20064,8 +22525,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 940
       FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 940 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[939] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20078,8 +22542,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 941
       FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 941 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[940] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[38] += amp_sv[0];
       jamp_sv[56] -= amp_sv[0];
@@ -20094,8 +22561,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 942
       FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 942 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[941] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -20108,8 +22578,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 943
       VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 943 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[942] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20162,8 +22635,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 945
       FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 945 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[944] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20182,8 +22658,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 946
       FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 946 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[945] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20202,8 +22681,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 947
       FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 947 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[946] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20222,8 +22704,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 948
       FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 948 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[947] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
@@ -20242,8 +22727,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 949
       FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 949 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[948] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[36] += amp_sv[0];
       jamp_sv[37] -= amp_sv[0];
@@ -20258,8 +22746,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 950
       FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 950 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[949] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] += amp_sv[0];
       jamp_sv[67] -= amp_sv[0];
@@ -20274,8 +22765,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 951
       VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 951 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[950] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= amp_sv[0];
       jamp_sv[19] += amp_sv[0];
@@ -20302,8 +22796,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 952
       VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 952 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[951] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= amp_sv[0];
       jamp_sv[18] += amp_sv[0];
@@ -20388,8 +22885,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 954
       VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 954 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[953] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -20416,8 +22916,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 955
       VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 955 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[954] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -20502,8 +23005,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 957
       VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 957 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[956] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -20530,8 +23036,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 958
       VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 958 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[957] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -20880,8 +23392,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 965 VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 965 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[964] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -20908,8 +23423,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 966 VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 966 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[965] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= amp_sv[0]; jamp_sv[9] += amp_sv[0]; @@ -20936,8 +23454,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 967 VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 967 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[966] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -20956,8 +23477,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 968 FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 968 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[967] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; @@ -20972,8 +23496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 969 FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 969 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[968] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -20986,8 +23513,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 970 FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 970 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[969] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += amp_sv[0]; jamp_sv[50] -= amp_sv[0]; @@ -21002,8 +23532,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 971 FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 971 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[970] += cxabs2( amp_sv[0] ); + denominators_sv += 
cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21016,8 +23549,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 972 VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 972 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[971] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21070,8 +23606,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 974 FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 974 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[973] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += amp_sv[0]; jamp_sv[52] -= amp_sv[0]; @@ -21086,8 +23625,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 975 FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 975 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[974] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21100,8 +23642,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 976 FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 976 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[975] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21114,8 +23659,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 977 VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 977 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[976] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21134,8 +23682,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 978 FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 978 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[977] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += amp_sv[0]; jamp_sv[85] -= amp_sv[0]; @@ -21150,8 +23701,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 979 FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 979 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { 
+ numerators_sv[978] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21164,8 +23718,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 980 FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 980 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[979] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += amp_sv[0]; jamp_sv[74] -= amp_sv[0]; @@ -21180,8 +23737,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 981 FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 981 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[980] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21194,8 +23754,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 982 VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 982 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[981] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21248,8 +23811,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 984 FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 984 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[983] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += amp_sv[0]; jamp_sv[76] -= amp_sv[0]; @@ -21264,8 +23830,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 985 FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 985 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[984] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21278,8 +23847,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 986 FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 986 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[985] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21292,8 +23864,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 987 VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 987 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[986] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21312,8 +23887,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 988 FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 988 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[987] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += amp_sv[0]; jamp_sv[19] -= amp_sv[0]; @@ -21328,8 +23906,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 989 FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 989 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[988] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21342,8 +23923,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 990 FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 990 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[989] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[46] += amp_sv[0]; jamp_sv[82] -= amp_sv[0]; @@ -21358,8 +23942,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 991 FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 991 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[990] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21372,8 +23959,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 992 VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 992 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[991] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21426,8 +24016,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 994 VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 994 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[993] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21446,8 +24039,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 995 FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 995 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[994] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += amp_sv[0]; jamp_sv[18] -= amp_sv[0]; @@ -21462,8 +24058,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 996 FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 996 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[995] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21476,8 +24075,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 997 FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 997 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[996] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[44] += amp_sv[0]; jamp_sv[58] -= amp_sv[0]; @@ -21492,8 +24094,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 998 FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 998 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[997] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -21506,8 +24111,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 999 VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 999 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[998] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21560,8 +24168,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1001 FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1001 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1000] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21580,8 +24191,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1002 FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1002 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1001] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21600,8 +24214,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1003 FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &_fp[0] 
); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1003 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1002] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21620,8 +24237,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1004 FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1004 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1003] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; @@ -21640,8 +24260,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1005 FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1004] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[42] += amp_sv[0]; jamp_sv[43] -= amp_sv[0]; @@ -21656,8 +24279,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1006 FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1006 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1005] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += amp_sv[0]; jamp_sv[61] -= amp_sv[0]; @@ -21672,8 +24298,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1007 VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1007 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1006] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -21700,8 +24329,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1008 VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1008 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1007] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -21786,8 +24418,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1010 VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1010 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1009] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -21814,8 +24449,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1011 VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1011 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1010] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -21900,8 +24538,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1013 VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1013 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1012] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -21928,8 +24569,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1014 VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1014 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1013] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -22192,8 +24836,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1019 VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1019 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1018] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -22220,8 +24867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1020 VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1020 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1019] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -22306,8 +24956,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1022 VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1022 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1021] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[21] -= amp_sv[0]; @@ -22334,8 +24987,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1023 VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1023 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1022] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -22420,8 +25076,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1025 VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1025 ) numerators_sv += cxabs2( amp_sv[0] 
); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1024] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -22448,8 +25107,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1026 VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1026 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1025] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[4] -= amp_sv[0]; @@ -22710,8 +25372,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1031 VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1031 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1030] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -22738,8 +25403,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1032 VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1032 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1031] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -22824,8 +25492,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1034 VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1034 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1033] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += amp_sv[0]; jamp_sv[23] -= amp_sv[0]; @@ -22852,8 +25523,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1035 VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1035 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1034] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -22938,8 +25612,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1037 VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1037 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1036] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -22966,8 +25643,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1038 VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1038 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) 
+ { + numerators_sv[1037] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[5] -= amp_sv[0]; @@ -23504,8 +26184,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1046 FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1046 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1045] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[52] -= amp_sv[0]; @@ -23517,8 +26200,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1047 FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1047 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1046] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[49] -= amp_sv[0]; @@ -23530,8 +26216,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1048 FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1048 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1047] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[58] -= amp_sv[0]; @@ -23543,8 +26232,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1049 FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1049 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1048] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[68] -= amp_sv[0]; @@ -23556,8 +26248,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1050 FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1050 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1049] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[55] -= amp_sv[0]; @@ -23569,8 +26264,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1051 FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1051 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1050] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] -= amp_sv[0]; @@ -23582,8 +26280,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1052 FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1052 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1051] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[50] -= amp_sv[0]; @@ -23595,8 +26296,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1053 FFV1_0( w_fp[40], 
w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1053 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1052] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] -= amp_sv[0]; @@ -23608,8 +26312,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1054 FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1054 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1053] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[56] -= amp_sv[0]; @@ -23621,8 +26328,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1055 FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1055 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1054] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] -= amp_sv[0]; @@ -23634,8 +26344,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1056 FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1056 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1055] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] -= amp_sv[0]; @@ -23647,8 +26360,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1057 FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1057 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1056] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] -= amp_sv[0]; @@ -23660,8 +26376,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1058 FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1058 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1057] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += amp_sv[0]; jamp_sv[49] -= amp_sv[0]; @@ -23676,8 +26395,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1059 FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1059 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1058] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23690,8 +26412,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1060 FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1060 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1059] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] += amp_sv[0]; jamp_sv[55] -= amp_sv[0]; @@ -23706,8 +26431,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1061 VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1061 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1060] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; @@ -23726,8 +26454,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1062 FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1062 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1061] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23740,8 +26471,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1063 VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1063 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1062] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; @@ -23794,8 +26528,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1065 FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1065 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1064] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[76] -= amp_sv[0]; @@ -23807,8 +26544,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1066 FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1066 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1065] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[73] -= amp_sv[0]; @@ -23820,8 +26560,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1067 FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1067 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1066] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[82] -= amp_sv[0]; @@ -23833,8 +26576,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1068 FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1068 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( 
storeChannelWeights ) + { + numerators_sv[1067] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] -= amp_sv[0]; @@ -23846,8 +26592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1069 FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1069 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1068] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[79] -= amp_sv[0]; @@ -23859,8 +26608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1070 FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1070 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1069] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[90] -= amp_sv[0]; @@ -23872,8 +26624,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1071 FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1071 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1070] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[74] -= amp_sv[0]; @@ -23885,8 +26640,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1072 FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1072 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1071] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= amp_sv[0]; @@ -23898,8 +26656,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1073 FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1073 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1072] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[80] -= amp_sv[0]; @@ -23911,8 +26672,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1074 FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1074 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1073] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[86] -= amp_sv[0]; @@ -23924,8 +26688,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1075 FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1075 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1074] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] -= amp_sv[0]; @@ -23937,8 +26704,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1076 FFV1_0( w_fp[60], w_fp[43], 
w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1076 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1075] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[84] -= amp_sv[0]; @@ -23950,8 +26720,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1077 FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1077 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1076] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += amp_sv[0]; jamp_sv[73] -= amp_sv[0]; @@ -23966,8 +26739,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1078 FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1078 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1077] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -23980,8 +26756,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1079 FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1079 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1078] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += amp_sv[0]; jamp_sv[79] -= amp_sv[0]; @@ -23996,8 +26775,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1080 VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1080 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1079] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24016,8 +26798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1081 FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1081 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1080] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24030,8 +26815,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1082 VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1082 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1081] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24084,8 +26872,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1084 
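// With one numerator slot per diagram and a denominator summed over all
// diagrams, the per-channel multichannel weight is presumably the usual
// single-diagram-enhancement ratio |A_c|^2 / sum_d |A_d|^2, i.e. one numerator
// slot divided by the shared denominator accumulated in the same pass. A
// hedged sketch of that ratio (the function name and scalar types are
// illustrative assumptions, not the generated code's API):
#include <vector>
double channelWeight( const std::vector<double>& numerators, // |A_d|^2 accumulated per diagram
                      double denominator,                    // sum of |A_d|^2 over all diagrams
                      unsigned int channelId )               // 1-based diagram/channel id, 0 = none
{
  if( channelId == 0 || denominator == 0. ) return 0.; // no channel selected, or nothing accumulated
  return numerators[channelId - 1] / denominator;
}
// For instance, an event generated on channel 1084 (the diagram below) would
// use numerators[1083] over the shared denominator.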
FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1084 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1083] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[100] -= amp_sv[0]; @@ -24097,8 +26888,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1085 FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1085 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1084] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[97] -= amp_sv[0]; @@ -24110,8 +26904,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1086 FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1086 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1085] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[106] -= amp_sv[0]; @@ -24123,8 +26920,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1087 FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1087 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1086] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[116] -= amp_sv[0]; @@ -24136,8 +26936,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1088 FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1088 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1087] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[103] -= amp_sv[0]; @@ -24149,8 +26952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1089 FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1089 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1088] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[114] -= amp_sv[0]; @@ -24162,8 +26968,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1090 FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1090 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1089] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[98] -= amp_sv[0]; @@ -24175,8 +26984,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1091 FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1091 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + 
numerators_sv[1090] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= amp_sv[0]; @@ -24188,8 +27000,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1092 FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1092 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1091] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[104] -= amp_sv[0]; @@ -24201,8 +27016,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1093 FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1093 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1092] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[110] -= amp_sv[0]; @@ -24214,8 +27032,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1094 FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1094 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1093] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] -= amp_sv[0]; @@ -24227,8 +27048,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1095 FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1095 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1094] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[108] -= amp_sv[0]; @@ -24240,8 +27064,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1096 FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1096 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1095] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += amp_sv[0]; jamp_sv[97] -= amp_sv[0]; @@ -24256,8 +27083,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1097 FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1097 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1096] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24270,8 +27100,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1098 FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1098 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1097] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += amp_sv[0]; jamp_sv[103] -= amp_sv[0]; @@ -24286,8 +27119,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 1099 VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1099 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1098] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24306,8 +27142,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1100 FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1100 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1099] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24320,8 +27159,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1101 VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1100] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24374,8 +27216,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1103 FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1103 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1102] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[40] += amp_sv[0]; jamp_sv[46] -= amp_sv[0]; @@ -24390,8 +27235,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1104 FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1104 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1103] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24404,8 +27252,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1105 FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1105 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1104] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[22] -= amp_sv[0]; @@ -24420,8 +27271,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1106 VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1106 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1105] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] -= cxtype( 0, 
1 ) * amp_sv[0];
       jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
@@ -24440,8 +27294,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1107
       FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1107 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1106] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -24454,8 +27311,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1108
       VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1108 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1107] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -24508,8 +27368,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1110
       FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1110 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1109] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[34] += amp_sv[0];
       jamp_sv[44] -= amp_sv[0];
@@ -24524,8 +27387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1111
       FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1111 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1110] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -24538,8 +27404,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1112
       FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1112 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1111] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -24554,8 +27423,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1113
       VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1113 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1112] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
@@ -24574,8 +27446,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1114
       FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1114 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1113] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
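// The unchanged context lines around each hunk scatter the amplitude into the
// color-flow array jamp_sv with coefficients of +/-1 or +/-i: cxtype( 0, 1 )
// is the imaginary unit, so "jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]" adds
// i * amp to one color flow while a partner flow is decremented. A scalar
// sketch of that bookkeeping (array size and indices are illustrative only):
#include <complex>
void addToColorFlows( std::complex<double>* jamp, const std::complex<double>& amp )
{
  const std::complex<double> I( 0., 1. ); // plays the role of cxtype( 0, 1 )
  jamp[66] += I * amp;  // this color flow gains i * amp
  jamp[108] -= I * amp; // its partner loses i * amp, as for diagram 1114 above
}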
amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24588,8 +27463,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1115 VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1115 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1114] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24642,8 +27520,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1117 FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1117 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1116] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[32] += amp_sv[0]; jamp_sv[38] -= amp_sv[0]; @@ -24658,8 +27539,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1118 FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1118 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1117] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24672,8 +27556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1119 FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1119 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1118] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -24688,8 +27575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1120 VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1120 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1119] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; @@ -24708,8 +27598,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1121 FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1121 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1120] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -24722,8 +27615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1122 VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1122 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( 
amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1121] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; @@ -30448,9 +33344,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -30526,8 +33421,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -30589,25 +33483,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return;
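Note: in this patch numerators_sv becomes a per-diagram array (one running |amp|^2 sum per diagram, as in the CPPProcess.cc hunks above), and normalise_output first reduces the per-helicity copies into the "helicity #0" slice before applying the single-diagram-enhancement weight. A minimal standalone sketch of that final weight, assuming the [ievt * ndiagrams + idiag] layout used above (multichannelWeight and its arguments are hypothetical names, not part of the patch):

#include <cassert>
#include <vector>

// weight = |amp(channel)|^2 / sum over diagrams of |amp|^2, per event,
// with numerators already summed over the good helicities
double multichannelWeight( const std::vector<double>& totNumerators,   // [nevt * ndiagrams]
                           const std::vector<double>& totDenominators, // [nevt]
                           int ndiagrams, int ievt, unsigned int channelId )
{
  assert( channelId >= 1 ); // channelIds use Fortran-style 1-based numbering
  return totNumerators[ievt * ndiagrams + ( channelId - 1 )] / totDenominators[ievt];
}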
@@ -30652,16 +33556,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[nevt * ndiagrams], summed over helicities + const fptype* allDenominators, // input: denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels )
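Note: the channel choice above is an inverse-CDF draw over the per-diagram numerators, skipping channels that have no single-diagram-enhancement config (channel2iconfig == -1); the fallback channelId = nchannels guards against the cumulative sum never crossing the random number because of rounding. A plain C++ mock of the same algorithm (sampleDiagram and its arguments are hypothetical names, not part of the patch):

#include <cstddef>
#include <vector>

unsigned int sampleDiagram( const std::vector<double>& numerators,   // per-diagram |amp|^2 for one event
                            const std::vector<int>& channel2iconfig, // -1 marks channels without an SDE config
                            double rnd )                             // uniform random number in [0,1)
{
  double norm = 0.;
  for( std::size_t ichan = 0; ichan < numerators.size(); ichan++ )
    if( channel2iconfig[ichan] != -1 ) norm += numerators[ichan];
  double cumsum = 0.;
  unsigned int channelId = static_cast<unsigned int>( numerators.size() ); // fallback: last channel
  for( std::size_t ichan = 0; ichan < numerators.size(); ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    cumsum += numerators[ichan];
    if( rnd < cumsum / norm )
    {
      channelId = static_cast<unsigned int>( ichan ) + 1; // 1-based channelId
      break;
    }
  }
  return channelId;
}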
@@ -30729,6 +33661,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -30738,6 +33671,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -30749,8 +33684,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -30778,7 +33715,6 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -30794,7 +33730,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -30808,11 +33744,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -30824,6 +33763,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -30832,9 +33772,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -30845,9 +33786,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel(
select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -30885,40 +33832,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -30933,7 +33846,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -30981,82 +33895,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -31081,13 +34010,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -31095,13 +34018,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 201a432a8a..ee2421cf9a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */
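Note: the sigmaKin declarations above gain the diagram-sampling arguments (allrnddiagram, allDiagramIdsOut, mulChannelWeight), and the new processConfig.h below exports the diagram count as a compile-time constant that sizes every per-diagram buffer (e.g. the cNGoodHel * ndiagrams * nevt memset in sigmaKin). A rough sizing sketch with illustrative numbers only (helNumeratorsBytes is a hypothetical name; nevt and nGoodHel are runtime values):

#include <cstddef>

// one fptype numerator per event, per good helicity, per diagram
std::size_t helNumeratorsBytes( std::size_t nevt, std::size_t nGoodHel, std::size_t ndiagrams, std::size_t fptypeSize )
{
  return nevt * nGoodHel * ndiagrams * fptypeSize;
}
// e.g. for this process (ndiagrams = 1240), nevt = 16384 and a worst case of 128 good
// helicities in double precision: 16384 * 128 * 1240 * 8 B, i.e. roughly 20 GB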
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h new file mode 100644 index 0000000000..fe7af482a7 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 1240; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool
is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118 + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + }
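Note: umami_get_meta (implemented above) writes an int for the count keys and an UmamiDevice for UMAMI_META_DEVICE, so callers can size their buffers before any instance exists. A minimal, hypothetical usage sketch (printCounts is not part of the patch):

#include "umami.h"
#include <cstdio>

void printCounts()
{
  int ndiag = 0, npar = 0;
  if( umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiag ) == UMAMI_SUCCESS &&
      umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ) == UMAMI_SUCCESS )
    std::printf( "%d diagrams, %d external particles\n", ndiag, npar ); // 1240 and 7 for gg>ttxggg
}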
+ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { + case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; // ceil( count / n_threads ) + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+ + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (mixed-precision builds process two neppV pages at once) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118 + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +}
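Note: end of the interface implementation; a minimal, hypothetical caller for the CPU build of umami_matrix_element, using only the umami.h API shown below (run, param_card and me2 are illustrative names; stride = nevents and offset = 0 process the whole batch in one call):

#include "umami.h"
#include <cstddef>
#include <vector>

int run( const char* param_card, const double* momenta, std::size_t nevents )
{
  // momenta layout per transpose_momenta above: momenta[( i_mom * npar + i_part ) * stride + i_event]
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, param_card ) != UMAMI_SUCCESS ) return 1;
  std::vector<double> me2( nevents );
  UmamiInputKey inKeys[1] = { UMAMI_IN_MOMENTA };
  const void* ins[1] = { momenta };
  UmamiOutputKey outKeys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outs[1] = { me2.data() };
  UmamiStatus st = umami_matrix_element( handle, nevents, nevents, 0, 1, inKeys, ins, 1, outKeys, outs );
  umami_free( handle );
  return st == UMAMI_SUCCESS ? 0 : 1;
}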
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. + */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata of the matrix element code, such as the target device + * and the particle, diagram and helicity counts. May be called before any + * instance is created. + * + * @param meta_key + * key selecting the metadata entry to query + * @param result + * pointer to the result; its type depends on the key (int for the count + * keys, UmamiDevice for UMAMI_META_DEVICE) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the requested outputs. + * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees a matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER
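Note: the count/stride/offset convention documented above implies a feature-major layout, where each feature (a momentum component of one particle, a diagram's |amp|^2, ...) occupies a contiguous run of `stride` events, and `offset` shifts the event index so sub-batches of a larger buffer can be evaluated call by call. A small worked index helper, mirroring amp2_out[stride * i_diag + i_event + offset] from umami.cc above (amp2Index is a hypothetical name):

#include <cstddef>

std::size_t amp2Index( std::size_t stride, std::size_t offset, std::size_t i_diag, std::size_t i_event )
{
  return stride * i_diag + i_event + offset;
}
// e.g. stride = 10, offset = 2: diagram 3 of event 1 lands at index 10 * 3 + 1 + 2 = 33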
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 4f7b5172f1..1240605a6d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004235267639160156  +DEBUG: model prefixing takes 0.003571033477783203  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.490 s +1 processes with 1240 diagrams generated in 0.709 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 5.122 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 2.512 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.290 s +ALOHA: aloha creates 5 routines in 0.157 s VVV1 VVV1 FFV1 @@ -189,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.290 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m10.012s -user 0m9.867s -sys 0m0.109s -Code generation completed in 10 seconds +real 0m7.550s +user 0m5.121s +sys 0m0.217s +Code generation completed in 8 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
- sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+ sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
 
+ // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+ // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+ static __host__ __device__ inline fptype_sv*
+ kernelAccessP( fptype* buffer )
+ {
+ return reinterpret_cast<fptype_sv*>( buffer );
+ }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_sm.h"
+#include "processConfig.h"
 
 #include 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
 typedef BufferBase BufferNumerators;
 
 // The size (number of elements) per event in a memory buffer for numerators
- constexpr size_t sizePerEventNumerators = 1;
+ // (should be equal to the number of diagrams in the process)
+ constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
index 2f17add993..12c45ededb 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -32338,9 +32399,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, 
allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -32416,8 +32476,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -32479,25 +32538,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -32542,16 +32611,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* 
allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -32619,6 +32716,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -32628,6 +32726,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb 
individual helicities @@ -32639,8 +32739,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -32668,7 +32770,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -32684,7 +32785,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -32698,11 +32799,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -32714,6 +32818,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -32722,9 +32827,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, 
allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -32735,9 +32841,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -32775,40 +32887,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -32823,7 +32901,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -32871,82 +32950,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -32971,13 +33065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -32985,13 +33073,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 201a432a8a..ee2421cf9a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
 int* allselcol, // output: helicity selection[nevt]
 fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h
new file mode 100644
index 0000000000..fe7af482a7
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/processConfig.h
@@ -0,0 +1,20 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
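+
+// Compile-time process constants shared between the process kernels and the
+// framework code: currently only the number of Feynman diagrams, used e.g. to
+// size the per-diagram numerator buffers (sizePerEventNumerators in MemoryBuffers.h).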
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 1240; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGGG_H \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); 
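+ // NB: the void* return value is unused; it only allows this function to initialize
+ // the static local variable in initialize() below, which guarantees once-only,
+ // thread-safe execution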
+ return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // fallback g_s = sqrt( 4 * pi * alpha_s ) for alpha_s = 0.118
+ }
+
+ __global__ void copy_outputs(
+ fptype* denominators,
+ fptype* numerators,
+ fptype* matrix_elements,
+ unsigned int* diagram_index,
+ int* color_index,
+ int* helicity_index,
+ double* m2_out,
+ double* amp2_out,
+ int* diagram_out,
+ int* color_out,
+ int* helicity_out,
+ std::size_t count,
+ std::size_t stride,
+ std::size_t offset )
+ {
+ std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+ if( i_event >= count ) return;
+
+ if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+ if( amp2_out )
+ {
+ double denominator = denominators[i_event];
+ for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+ {
+ amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+ }
+ }
+ if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+ if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+ if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+ }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+ struct InterfaceInstance
+ {
+ bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+ gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+ };
+
+}
+
+extern "C"
+{
+ UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+ {
+ switch( meta_key )
+ {
+ case UMAMI_META_DEVICE:
+ {
+ UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+ device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+ device = UMAMI_DEVICE_HIP;
+#endif
+#else
+ device = UMAMI_DEVICE_CPU;
+#endif
+ break;
+ }
+ case UMAMI_META_PARTICLE_COUNT:
+ *static_cast<int*>( result ) = CPPProcess::npar;
+ break;
+ case UMAMI_META_DIAGRAM_COUNT:
+ *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+ break;
+ case UMAMI_META_HELICITY_COUNT:
+ *static_cast<int*>( result ) = CPPProcess::ncomb;
+ break;
+ case UMAMI_META_COLOR_COUNT:
+ return UMAMI_ERROR_UNSUPPORTED_META;
+ default:
+ return UMAMI_ERROR_UNSUPPORTED_META;
+ }
+ return UMAMI_SUCCESS;
+ }
+
+ UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+ {
+ CPPProcess process;
+ process.initProc( param_card_path );
+ auto instance = new InterfaceInstance();
+ *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+ for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+ {
+ gpuStreamCreate( &instance->hel_streams[ihel] );
+ }
+#endif
+ return UMAMI_SUCCESS;
+ }
+
+ UmamiStatus umami_set_parameter(
+ UmamiHandle handle,
+ char const* name,
+ double parameter_real,
+ double parameter_imag )
+ {
+ return UMAMI_ERROR_NOT_IMPLEMENTED;
+ }
+
+ UmamiStatus umami_get_parameter(
+ UmamiHandle handle,
+ char const* name,
+ double* parameter_real,
+ double* parameter_imag )
+ {
+ return UMAMI_ERROR_NOT_IMPLEMENTED;
+ }
+
+ UmamiStatus umami_matrix_element(
+ UmamiHandle handle,
+ size_t count,
+ size_t stride,
+ size_t offset,
+ size_t input_count,
+ UmamiInputKey const* input_keys,
+ void const* const* inputs,
+ size_t output_count,
+ UmamiOutputKey const* output_keys,
+ void* const* outputs )
+ {
+ const double* momenta_in = nullptr;
+ const double* alpha_s_in = nullptr;
+ const int* flavor_in = nullptr; // TODO: unused
+ const double* random_color_in = nullptr;
+ const double* random_helicity_in = nullptr;
+ const double* random_diagram_in = nullptr;
+ const int* diagram_in = nullptr; // TODO: unused
+
+ for( std::size_t i = 0; i < input_count; ++i )
+ {
+ const void* input = inputs[i];
+ switch( input_keys[i] )
+ {
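+ // map each input key to its typed pointer; flavor and diagram indices are
+ // accepted but not used yet (see the TODOs above), helicity index input is
+ // not supported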
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to twice the page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata of the matrix element implementation, such as the device + * it runs on or the number of particles, diagrams and helicity combinations of the + * process. + * + * @param meta_key + * key identifying the metadata entry to query + * @param result + * pointer to the memory the value is written to; its type depends on the key + * (UmamiDevice for UMAMI_META_DEVICE, int for the counts) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * on return, points to the new instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
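+   *
+   * Memory layout: scalar per-event quantities are addressed as array[i_event + offset]
+   * for i_event in [0, count); multi-component quantities such as UMAMI_OUT_DIAGRAM_AMP2
+   * store component i at array[i * stride + i_event + offset] (see the copy_outputs
+   * kernel and the host event loop in the implementation).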
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees a matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 71b7095c67..f2e018386d 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004422187805175781  +DEBUG: model prefixing takes 0.0024957656860351562  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,21 +166,21 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
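For orientation, the following is a minimal sketch of a caller driving the UMAMI C API declared in umami.h above. It is illustrative only and not part of the patch: the param card path and batch size are placeholders, the momenta are left zeroed and would have to be filled with real phase-space points following the stride/offset layout, and error handling is reduced to early returns.

/* umami_demo.c -- illustrative sketch, not part of this patch */
#include <stdio.h>
#include <stdlib.h>
#include "umami.h"

int main( void )
{
  int npar = 0, ndiag = 0; /* counts are written as plain ints by the implementation above */
  if( umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ) != UMAMI_SUCCESS ) return 1;
  if( umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiag ) != UMAMI_SUCCESS ) return 1;

  UmamiHandle handle;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; /* placeholder path */

  /* a contiguous batch: stride == count, offset == 0 */
  const size_t count = 8, stride = 8, offset = 0;
  double* momenta = (double*)calloc( count * (size_t)npar * 4, sizeof( double ) ); /* fill with real phase-space points */
  double* me = (double*)calloc( count, sizeof( double ) );
  double* amp2 = (double*)calloc( count * (size_t)ndiag, sizeof( double ) ); /* diagram i at amp2[i * stride + ievt] */

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA };
  const void* const inputs[] = { momenta };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT, UMAMI_OUT_DIAGRAM_AMP2 };
  void* const outputs[] = { me, amp2 };

  UmamiStatus status = umami_matrix_element( handle, count, stride, offset,
                                             1, in_keys, inputs, 2, out_keys, outputs );
  if( status == UMAMI_SUCCESS )
    for( size_t ievt = 0; ievt < count; ++ievt )
      printf( "ME[%zu] = %g\n", ievt, me[ievt] );

  free( momenta ); free( me ); free( amp2 );
  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}

Omitting the UMAMI_IN_RANDOM_* inputs is allowed; as seen in the host event loop above, the implementation then falls back to fixed defaults (0.5) for the random numbers.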
-8 processes with 40 diagrams generated in 0.058 s +8 processes with 40 diagrams generated in 0.069 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -199,9 +200,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -210,50 +211,52 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.026 s -Wrote files for 32 helas calls in 0.131 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.049 s +Wrote files for 32 helas calls in 3.302 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.106 s +ALOHA: aloha creates 2 routines in 0.104 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.094 s +ALOHA: aloha creates 4 routines in 0.087 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. 
+DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m2.314s -user 0m1.828s -sys 0m0.404s -Code generation completed in 2 seconds +real 0m10.462s +user 0m1.475s +sys 0m0.703s +Code generation completed in 10 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -274,9 +277,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -303,9 +306,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 7d4745918b..5091f85849 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + 
bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -426,8 +493,11 @@ // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -440,8 +510,11 @@ // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -454,8 +527,11 @@ // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -787,9 +863,8 @@ gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -865,8 +940,7 @@ cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -928,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -991,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice 
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1068,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1077,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1088,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1117,7 +1234,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
HostAccessNumerators; // non-trivial access: buffer includes all events
 using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
- using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1133,7 +1249,7 @@ namespace mg5amcCpu
 gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
- gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
 gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
 gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1147,11 +1263,14 @@ namespace mg5amcCpu
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
- numerators_sv = fptype_sv{ 0 };
+ for( int i = 0; i < processConfig::ndiagrams; ++i )
+ {
+ numerators_sv[i] = fptype_sv{ 0 };
+ }
 denominators_sv = fptype_sv{ 0 };
 #endif
 }
@@ -1163,6 +1282,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
 // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
 // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1171,9 +1291,10 @@ namespace mg5amcCpu
 const int ihel = cGoodHel[ighel];
 fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+ fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
 gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1184,9 +1305,15 @@ namespace mg5amcCpu
 // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
 // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
 gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Event-by-event random choice of color #402
- gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+ // Event-by-event random choice of color and diagram #402
+ gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
 // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1224,40 +1351,6 @@ namespace mg5amcCpu
 const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
 const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
- // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
- // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
- if( allChannelIds != nullptr )
- {
- // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
- const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
- uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
- // NB: channelIds_sv is a scalar in no-SIMD C++
- channelId = channelIds_sv;
-#else
- // NB: channelIds_sv is a vector in SIMD C++
- channelId = channelIds_sv[0]; // element[0]
- for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
- {
- assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
- }
-#endif
- assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
- const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
- uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
- // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- for( int i = 0; i < neppV; ++i )
- {
- assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
- }
-#endif
- }
 #endif
 // Running sum of partial amplitudes squared for event by event color selection (#402)
 // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1272,7 +1365,8 @@ namespace mg5amcCpu
 cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
 calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1320,82 +1414,97 @@ namespace mg5amcCpu
 }
 #endif
 }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
- // Event-by-event random choice of color #402
- if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
- {
- if( channelId > mgOnGpu::nchannels )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
- assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
- }
- // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
- // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
- if( iconfig <= 0 )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
- assert( iconfig > 0 ); // SANITY CHECK #917
- }
- else if( iconfig > (int)mgOnGpu::nconfigSDE )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
- assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
- }
- fptype_sv targetamp[ncolor] = { 0 };
- // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( icolC == 0 )
- targetamp[icolC] = fptype_sv{ 0 };
- else
- targetamp[icolC] = targetamp[icolC - 1];
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
- }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv targetamp2[ncolor] = { 0 };
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ const int vecsize = 2 * neppV;
+#else
+ const int vecsize = neppV;
+#endif
+ unsigned int channelIdVec[vecsize];
+ if( allChannelIds != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
- if( icolC == 0 )
- targetamp2[icolC] = fptype_sv{ 0 };
- else
- targetamp2[icolC] = targetamp2[icolC - 1];
- // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+ const int ievt = ievt00 + ieppV;
+ channelIdVec[ieppV] = allChannelIds[ievt];
 }
-#endif
- for( int ieppV = 0; ieppV < neppV; ++ieppV )
+ }
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
 const int ievt = ievt00 + ieppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
 {
-#if defined MGONGPU_CPPSIMD
- // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
- volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
- const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
- if( okcol )
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ }
+ channelIdVec[ieppV] = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
 {
- allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+ channelIdVec[ieppV] = ichan + 1;
 break;
 }
 }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- const int ievt2 = ievt00 + ieppV + neppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+ allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+ }
+ }
+
+ // Event-by-event random choice of color #402
+ if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+ {
+ unsigned int channelId = channelIdVec[ieppV];
+ if( channelId > mgOnGpu::nchannels )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+ assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+ }
+ // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+ // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+ if( iconfig <= 0 )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+ assert( iconfig > 0 ); // SANITY CHECK #917
+ }
+ else if( iconfig > (int)mgOnGpu::nconfigSDE )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+ assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+ }
+ fptype targetamp[ncolor] = { 0 };
+ // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( icolC == 0 )
+ targetamp[icolC] = 0;
+ else
+ targetamp[icolC] = targetamp[icolC - 1];
+ if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+ }
+ const int ievt = ievt00 + ieppV;
+ //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
 for( int icolC = 0; icolC < ncolor; icolC++ )
 {
- if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+ if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
 {
- allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+ allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+ //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
 break;
 }
 }
-#endif
 }
 }
 else
@@ -1420,13 +1529,7 @@ namespace mg5amcCpu
 // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
 // [NB 'sum over final spins, average over initial spins', eg see
 // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
 for( int ipagV = 0; ipagV < npagV; ++ipagV )
 {
 const int ievt0 = ipagV * neppV;
@@ -1434,13 +1537,14 @@ namespace mg5amcCpu
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
- MEs_sv *= numerators_sv / denominators_sv;
+ MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
 }
 #endif
 //for( int ieppV = 0; ieppV < neppV; ieppV++ )
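Aside for reviewers (illustration only, not part of the patch): the event-by-event channel choice added above is plain inverse-CDF sampling over the per-diagram numerators, skipping channels with channel2iconfig == -1. A minimal scalar sketch of the same logic, with hypothetical names and none of the SIMD page indexing:

  #include <cassert>
  #include <vector>
  // Pick a 1-based channelId with probability numerators[i] / sum(numerators).
  unsigned int sampleDiagram( const std::vector<double>& numerators, double rnd )
  {
    double total = 0.;
    for( double n : numerators ) total += n; // normalization = sum of per-diagram weights
    assert( total > 0. && rnd >= 0. && rnd < 1. );
    double cumulative = 0.;
    for( unsigned int idiag = 0; idiag < numerators.size(); ++idiag )
    {
      cumulative += numerators[idiag];
      if( rnd < cumulative / total ) return idiag + 1; // 1-based channelId, as in the patch
    }
    return static_cast<unsigned int>( numerators.size() ); // rounding guard, like the nchannels default above
  }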
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
index bd42537623..0bf2e4625f 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
 int* allselcol, // output: helicity selection[nevt]
 fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h
new file mode 100644
index 0000000000..fe66e4e760
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXU_H
+#define MG5_CONFIG_SIGMA_SM_GU_TTXU_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 5;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GU_TTXU_H
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
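Aside (illustration only): the new per-process processConfig.h exposes the diagram count as a compile-time constant, which is what lets the numerator buffers grow from one slot per event to one slot per (event, diagram) pair. A sketch of the host-side AOSOA index arithmetic assumed by these hunks, with a hypothetical helper name:

  #include <cstddef>
  namespace processConfig { constexpr int ndiagrams = 5; } // as in the new header
  // Within a SIMD page of neppV events, the ndiagrams weights of each event are
  // stored diagram-major, matching the expression used in the CPU sampling loop:
  // allNumerators[ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV]
  constexpr std::size_t numeratorIndex( std::size_t ievt, std::size_t idiag, std::size_t neppV )
  {
    return ievt / neppV * neppV * processConfig::ndiagrams + idiag * neppV + ievt % neppV;
  }
  static_assert( numeratorIndex( 0, 0, 4 ) == 0, "first event, first diagram" );
  static_assert( numeratorIndex( 1, 2, 4 ) == 9, "page 0: idiag*neppV + ieppV = 2*4 + 1" );
  static_assert( numeratorIndex( 5, 0, 4 ) == 21, "page 1 starts at neppV*ndiagrams = 20" );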
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc
index 414284b61a..9a64abfae3 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -101,6 +102,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+ ,
+ const int ievt00,
+ bool sanityCheckMixedPrecision = true
+#endif
+ )
+ {
+ unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+ using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+ // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+ if( allChannelIds != nullptr )
+ {
+ const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+ const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+ // NB: channelIds_sv is a scalar in CUDA
+ channelId = channelIds_sv;
+ assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+ }
+#else // Cuda or C++
+ using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+ // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+ // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+ if( allChannelIds != nullptr )
+ {
+ // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+ const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+ uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+ // NB: channelIds_sv is a scalar in no-SIMD C++
+ channelId = channelIds_sv;
+#else
+ // NB: channelIds_sv is a vector in SIMD C++
+ channelId = channelIds_sv[0]; // element[0]
+ for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+ {
+ assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+ }
+#endif
+ assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+ if( sanityCheckMixedPrecision )
+ {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+ const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+ uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+ // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+ for( int i = 0; i < neppV; ++i )
+ {
+ assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+ }
+#endif
+ }
+ }
+#endif // MGONGPUCPP_GPUIMPL
+ return channelId;
+ }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
 constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
 constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
 constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
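Aside (illustration only): getChannelId relies on the invariant that all events in one SIMD page carry the same channelId, so a single scalar can represent the page. The core of that check, reduced to plain arrays with hypothetical names:

  #include <cassert>
  // Returns the common channelId of one SIMD page, asserting uniformity as in #898.
  unsigned int pageChannelId( const unsigned int* channelIds, int neppV )
  {
    unsigned int channelId = channelIds[0]; // element[0] stands in for the whole page
    for( int i = 1; i < neppV; ++i )
      assert( channelId == channelIds[i] ); // all events in the page must agree
    assert( channelId > 0 ); // 0 would mean "multichannel disabled"
    return channelId;
  }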
@@ -242,7 +306,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+ bool storeChannelWeights,
 fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
 fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
 fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -251,7 +315,7 @@ namespace mg5amcCpu
 #else
 cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+ bool storeChannelWeights,
 fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
 fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
 fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -347,7 +411,8 @@ namespace mg5amcCpu
 const fptype* COUPs[nxcoup];
 for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = allNumerators;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+ fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
 fptype* denominators = allDenominators;
 #endif
 #else
@@ -360,7 +425,7 @@ namespace mg5amcCpu
 for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -369,12 +434,8 @@ namespace mg5amcCpu
 for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
- // SCALAR channelId for the current event (CUDA)
- unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
 // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -397,8 +458,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 1
 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[0] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[1] -= 1. / 6. * amp_sv[0];
 jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -412,8 +476,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 2
 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[1] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[2] -= 1. / 6. * amp_sv[0];
 jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -426,8 +493,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 3
 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[2] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[0] += 1. / 2. * amp_sv[0];
 jamp_sv[2] -= 1. / 6. * amp_sv[0];
@@ -440,8 +510,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 4
 FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[3] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[0] += 1. / 2. * amp_sv[0];
 jamp_sv[1] -= 1. / 6. * amp_sv[0];
@@ -454,8 +527,11 @@ namespace mg5amcCpu
 // Amplitude(s) for diagram number 5
 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ if( storeChannelWeights )
+ {
+ numerators_sv[4] += cxabs2( amp_sv[0] );
+ denominators_sv += cxabs2( amp_sv[0] );
+ }
 #endif
 jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
 jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
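Aside (illustration only): with one numerator slot per diagram, the single-diagram enhancement applied in normalise_output reduces to ME' = ME * N_c / D, where N_c is the chosen channel's |amp|^2 summed over good helicities and D is the same sum over all diagrams. A scalar sketch with hypothetical names:

  #include <cassert>
  // Channel weight for the single-diagram enhancement: ME' = ME * N_c / D.
  double applyChannelWeight( double me, const double* numerators, double denominator, unsigned int channelId )
  {
    assert( channelId >= 1 ); // channelId is 1-based, following the Fortran convention
    double weight = numerators[channelId - 1] / denominator; // in [0,1] since D is the sum of all N_d
    return me * weight;
  }

For example, with per-diagram numerators {2, 1, 1} and denominator 4, channelId 1 yields a weight of 0.5.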
@@ -787,9 +863,8 @@ namespace mg5amcCpu
 gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
 // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
- constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
- gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+ constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+ gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
 gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -865,8 +940,7 @@ namespace mg5amcCpu
 cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
- constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
- calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
 calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -928,25 +1002,35 @@ namespace mg5amcCpu
 fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ bool storeChannelWeights, // if true, compute final multichannel weights
+ bool mulChannelWeight, // if true, multiply matrix element by channel weight
 #endif
- const fptype globaldenom ) /* clang-format on */
+ const fptype globaldenom ) /* clang-format on */
 {
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
 allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const int nevt = gridDim.x * blockDim.x;
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
 fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
 fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
 for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
 {
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- totAllNumerators[ievt] += hAllNumerators[ievt];
 totAllDenominators[ievt] += hAllDenominators[ievt];
+ fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+ fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+ for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+ {
+ firstNumerator[idiag] += hAllNumerators[idiag];
+ }
+ }
+ if( mulChannelWeight )
+ {
+ unsigned int channelId = allChannelIds[ievt];
+ allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
 }
- allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
 }
 #endif
 return;
@@ -991,16 +1075,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 __global__ void
- select_col( int* allselcol, // output: color selection[nevt]
- const fptype* allrndcol, // input: random numbers[nevt] for color selection
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
- const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
- const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ select_col_and_diag( int* allselcol, // output: color selection[nevt]
+ unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+ const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+ const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+ const fptype* allNumerators, // input: all numerators
+ const fptype* allDenominators, // input: all denominators
+ const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 {
 const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
 // SCALAR channelId for the current event (CUDA)
 unsigned int channelId = gpu_channelId( allChannelIds );
 // Event-by-event random choice of color #402
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ }
+ channelId = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
+ {
+ channelId = ichan + 1;
+ break;
+ }
+ }
+ allDiagramIdsOut[ievt] = channelId;
+ }
+
 if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
 {
 if( channelId > mgOnGpu::nchannels )
@@ -1068,6 +1180,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -1077,6 +1190,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1088,8 +1203,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 int* allselcol, // output: helicity selection[nevt]
- fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1117,7 +1234,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
 using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
- using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1133,7 +1249,7 @@ namespace mg5amcCpu
 gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
- gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
 gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
 gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1147,11 +1263,14 @@ namespace mg5amcCpu
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
- numerators_sv = fptype_sv{ 0 };
+ for( int i = 0; i < processConfig::ndiagrams; ++i )
+ {
+ numerators_sv[i] = fptype_sv{ 0 };
+ }
 denominators_sv = fptype_sv{ 0 };
 #endif
 }
@@ -1163,6 +1282,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
 // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
 // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
 // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1171,9 +1291,10 @@ namespace mg5amcCpu
 const int ihel = cGoodHel[ighel];
 fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+ fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
 fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
- gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
 gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1184,9 +1305,15 @@ namespace mg5amcCpu
 // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
 // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
 gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Event-by-event random choice of color #402
- gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+ // Event-by-event random choice of color and diagram #402
+ gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+ gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
 // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1224,40 +1351,6 @@ namespace mg5amcCpu
 const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
 const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
- // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
- // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
- if( allChannelIds != nullptr )
- {
- // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
- const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
- uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
- // NB: channelIds_sv is a scalar in no-SIMD C++
- channelId = channelIds_sv;
-#else
- // NB: channelIds_sv is a vector in SIMD C++
- channelId = channelIds_sv[0]; // element[0]
- for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
- {
- assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
- }
-#endif
- assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
- const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
- uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
- // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- for( int i = 0; i < neppV; ++i )
- {
- assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
- }
-#endif
- }
 #endif
 // Running sum of partial amplitudes squared for event by event color selection (#402)
 // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1272,7 +1365,8 @@ namespace mg5amcCpu
 cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
- calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+ bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
 calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1320,82 +1414,97 @@ namespace mg5amcCpu
 }
 #endif
 }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
- // Event-by-event random choice of color #402
- if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
- {
- if( channelId > mgOnGpu::nchannels )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
- assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
- }
- // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
- // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
- if( iconfig <= 0 )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
- assert( iconfig > 0 ); // SANITY CHECK #917
- }
- else if( iconfig > (int)mgOnGpu::nconfigSDE )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
- assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
- }
- fptype_sv targetamp[ncolor] = { 0 };
- // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( icolC == 0 )
- targetamp[icolC] = fptype_sv{ 0 };
- else
- targetamp[icolC] = targetamp[icolC - 1];
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
- }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv targetamp2[ncolor] = { 0 };
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ const int vecsize = 2 * neppV;
+#else
+ const int vecsize = neppV;
+#endif
+ unsigned int channelIdVec[vecsize];
+ if( allChannelIds != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
- if( icolC == 0 )
- targetamp2[icolC] = fptype_sv{ 0 };
- else
- targetamp2[icolC] = targetamp2[icolC - 1];
- // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+ const int ievt = ievt00 + ieppV;
+ channelIdVec[ieppV] = allChannelIds[ievt];
 }
-#endif
- for( int ieppV = 0; ieppV < neppV; ++ieppV )
+ }
+
+ // Event-by-event random choice of channel
+ if( allrnddiagram != nullptr )
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
 {
 const int ievt = ievt00 + ieppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
- for( int icolC = 0; icolC < ncolor; icolC++ )
+ fptype numerator_sum = 0., normalization = 0.;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
 {
-#if defined MGONGPU_CPPSIMD
- // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
- volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
- const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
- if( okcol )
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ }
+ channelIdVec[ieppV] = mgOnGpu::nchannels;
+ for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+ {
+ if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+ numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+ if( allrnddiagram[ievt] < numerator_sum / normalization )
 {
- allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+ channelIdVec[ieppV] = ichan + 1;
 break;
 }
 }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- const int ievt2 = ievt00 + ieppV + neppV;
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+ allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+ }
+ }
+
+ // Event-by-event random choice of color #402
+ if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+ {
+ for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+ {
+ unsigned int channelId = channelIdVec[ieppV];
+ if( channelId > mgOnGpu::nchannels )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+ assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+ }
+ // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+ // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+ if( iconfig <= 0 )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+ assert( iconfig > 0 ); // SANITY CHECK #917
+ }
+ else if( iconfig > (int)mgOnGpu::nconfigSDE )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+ assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+ }
+ fptype targetamp[ncolor] = { 0 };
+ // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( icolC == 0 )
+ targetamp[icolC] = 0;
+ else
+ targetamp[icolC] = targetamp[icolC - 1];
+ if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+ }
+ const int ievt = ievt00 + ieppV;
+ //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
 for( int icolC = 0; icolC < ncolor; icolC++ )
 {
- if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+ if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
 {
- allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+ allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+ //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
 break;
 }
 }
-#endif
 }
 }
 else
@@ -1420,13 +1529,7 @@ namespace mg5amcCpu
 // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
 // [NB 'sum over final spins, average over initial spins', eg see
 // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
- gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
 for( int ipagV = 0; ipagV < npagV; ++ipagV )
 {
 const int ievt0 = ipagV * neppV;
@@ -1434,13 +1537,14 @@ namespace mg5amcCpu
 fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
 MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+ if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
 {
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
 fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
 fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
- MEs_sv *= numerators_sv / denominators_sv;
+ MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
 }
 #endif
 //for( int ieppV = 0; ieppV < neppV; ieppV++ )
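Aside (illustration only): the GPU path in sigmaKin launches one calculate_jamps per good helicity on a dedicated stream and synchronizes only before the cross-helicity reduction. The same pattern in plain CUDA, with a stand-in kernel and no error checking:

  #include <cuda_runtime.h>

  __global__ void work( float* buf, int n ) // stand-in for one per-helicity kernel
  {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if( i < n ) buf[i] += 1.f;
  }

  void launchPerHelicity( float* bufs[], int nGoodHel, int n )
  {
    cudaStream_t streams[16]; // assume nGoodHel <= 16 for this sketch
    for( int ighel = 0; ighel < nGoodHel; ++ighel )
    {
      cudaStreamCreate( &streams[ighel] );
      work<<<( n + 255 ) / 256, 256, 0, streams[ighel]>>>( bufs[ighel], n ); // one helicity per stream
    }
    for( int ighel = 0; ighel < nGoodHel; ++ighel )
    {
      cudaStreamSynchronize( streams[ighel] ); // wait before the cross-helicity reduction
      cudaStreamDestroy( streams[ighel] );
    }
  }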
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
index dd4aae8a06..9191598e88 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
 fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
 fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
 fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
 const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+ const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
 int* allselcol, // output: helicity selection[nevt]
 fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
 fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+ unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+ bool mulChannelWeight, // if true, multiply channel weight to ME output
 #endif
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h
new file mode 100644
index 0000000000..89823b9d1d
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H
+#define MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 5;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cstddef>
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool is_good_hel[CPPProcess::ncomb];
+    sigmaKin_getGoodHel( momenta, couplings, matrix_elements, numerators, denominators,
+#ifdef MGONGPUCPP_GPUIMPL
+                         color_jamps,
+#endif
+                         is_good_hel,
+                         count );
+    sigmaKin_setGoodHel( is_good_hel );
+    return nullptr;
+  }
+
+  void initialize(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    // static local initialization is called exactly once in a thread-safe way
+    static void* dummy = initialize_impl( momenta, couplings, matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+                                          color_jamps,
+#endif
+                                          numerators,
+                                          denominators,
+                                          count );
+  }
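Aside (illustration only): the initialize wrapper above uses C++11 "magic statics": the initializer of a function-local static runs exactly once, even under concurrent first calls, so the good-helicity filtering cannot run twice. The idiom in isolation, with hypothetical names:

  #include <cstdio>

  int expensiveSetup()
  {
    std::puts( "setup runs once" ); // side effect happens on the first call only
    return 42;
  }

  int getConfig()
  {
    // C++11 guarantees thread-safe, once-only initialization of local statics
    static int config = expensiveSetup();
    return config;
  }

  int main()
  {
    getConfig(); // prints "setup runs once"
    getConfig(); // prints nothing
  }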
+
+#ifdef MGONGPUCPP_GPUIMPL
+
+  __global__ void copy_inputs(
+    const double* momenta_in,
+    const double* helicity_random_in,
+    const double* color_random_in,
+    const double* diagram_random_in,
+    const double* alpha_s_in,
+    fptype* momenta,
+    fptype* helicity_random,
+    fptype* color_random,
+    fptype* diagram_random,
+    fptype* g_s,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
+    helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
+    color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
+    g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to twice the SIMD page size for some reason
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
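Taken together, umami.cc above exposes a pure C calling convention: the caller pairs parallel arrays of keys and void pointers for inputs and outputs. The following host-side sketch shows one way to drive it (illustrative only: the param card path, event count and zeroed momenta are placeholders, and real phase-space points would be filled in before the call):

#include "umami.h"
#include <cstdio>
#include <vector>

int main()
{
  int npar = 0;
  if( umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ) != UMAMI_SUCCESS ) return 1;

  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, "../Cards/param_card.dat" ) != UMAMI_SUCCESS ) return 1;

  const size_t count = 16, stride = count, offset = 0;
  std::vector<double> momenta( 4 * npar * count ); // [component][particle][event], event stride = stride
  std::vector<double> alpha_s( count, 0.118 );
  std::vector<double> me( count );

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
  const void* ins[] = { momenta.data(), alpha_s.data() };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outs[] = { me.data() };

  // momenta would be filled with physical phase-space points here
  UmamiStatus status = umami_matrix_element( handle, count, stride, offset,
                                             2, in_keys, ins, 1, out_keys, outs );
  if( status == UMAMI_SUCCESS ) std::printf( "|M|^2[0] = %g\n", me[0] );
  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}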
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
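Every entry point below returns one of the UmamiStatus codes above, so callers usually route all calls through a small checking helper. A minimal sketch (the helper name is illustrative, not part of the interface):

#include "umami.h"
#include <cstdio>

static void umami_check( UmamiStatus status, const char* what )
{
  // report any non-success code together with the failing call site
  if( status != UMAMI_SUCCESS )
    std::fprintf( stderr, "%s failed with UmamiStatus %d\n", what, (int)status );
}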
+  /**
+   * Queries a global property of the compiled matrix element, such as the device
+   * it runs on or the number of particles, diagrams or helicity combinations.
+   *
+   * @param meta_key
+   *    key identifying the requested piece of metadata
+   * @param result
+   *    pointer to caller-owned storage that receives the value; its type depends
+   *    on the key (UmamiDevice for UMAMI_META_DEVICE, int for the counts)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset added to the event index when addressing the input and output arrays
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index d16040de18..016603b556 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@
Note that you can still compile and run aMC@NLO with the built-in PDFs
Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg
The import format was not given, so we guess it as command
set stdout_level DEBUG
set output information to level: 10
@@ -56,7 +57,7 @@ set zerowidth_tchannel F
define q = u c d s u~ c~ d~ s~
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.004274129867553711
+DEBUG: model prefixing takes 0.001806020736694336
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions
DEBUG: remove interactions: u s w+ at order: QED=1
@@ -165,13 +166,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams.
INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams.
INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams.
-8 processes with 40 diagrams generated in 0.059 s +8 processes with 40 diagrams generated in 0.080 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -183,45 +184,45 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=1 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.023 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=1 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.037 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.105 s +ALOHA: aloha creates 2 routines in 0.081 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.535s -user 0m0.481s -sys 0m0.048s -Code generation completed in 1 seconds +real 0m2.356s +user 0m0.505s +sys 0m0.153s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
#else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
#endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h
index 2fa0ce29e0..9bc525d8ad 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
#include "CPPProcess.h"
#include "GpuRuntime.h"
#include "Parameters_sm.h"
+#include "processConfig.h"

#include

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
index e57428e73e..0bf3c7da89 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc
@@ -27,6 +27,7 @@
#include "MemoryAccessMomenta.h"
#include "MemoryAccessWavefunctions.h"
#include "color_sum.h"
+#include "processConfig.h"

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#include "MemoryAccessDenominators.h"
@@ -101,6 +102,69 @@ namespace mg5amcGpu
namespace mg5amcCpu
#endif
{
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               
const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -782,9 +843,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, 
allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -860,8 +920,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -923,25 +982,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -986,16 +1055,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random 
numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1063,6 +1160,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1072,6 +1170,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1083,8 
+1183,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1112,7 +1214,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1128,7 +1229,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1142,11 +1243,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1158,6 +1262,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1166,9 +1271,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, 
allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1179,9 +1285,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1219,40 +1331,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1267,7 +1345,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1315,82 +1394,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1415,13 +1509,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1429,13 +1517,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index bd42537623..0bf2e4625f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* 
allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h new file mode 100644 index 0000000000..fe66e4e760 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
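// [Editor's note - illustrative sketch, not part of the generated file] processConfig.h
// exposes the per-process diagram count as a compile-time constant so that the kernels in
// this patch can size the per-diagram numerator buffers as nevt * ndiagrams (see the
// gpuMemset and ieventAccessRecord changes above). The helpers below are hypothetical and
// only model the two flat layouts the patch uses:
inline int numIndexGpu( int ievt, int idiag, int ndiagrams )
{
  return ievt * ndiagrams + idiag; // CUDA/HIP: all diagrams of one event are contiguous
}
inline int numIndexCpuSimd( int ievt, int idiag, int ndiagrams, int neppV )
{
  // C++ SIMD: within a page of neppV events, the neppV values of one diagram are contiguous
  return ( ievt / neppV ) * neppV * ndiagrams + idiag * neppV + ievt % neppV;
}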
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXU_H \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 57dd4fee2d..959f91ae65 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -782,9 +843,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + 
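// [Editor's note] In the helicity-filtering path below, the multichannel machinery is not
// needed, so the old convention of passing allChannelIds = nullptr (GPU) or channelId = 0
// (C++) to calculate_jamps is replaced by hard-coding the new boolean parameter to false.
// In sigmaKin itself the same flag is instead derived from the run configuration, as in the
// hunks elsewhere in this patch:
//   bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
// i.e. channel weights are accumulated both when Fortran supplies fixed channel ids and
// when event-by-event diagram sampling is requested via random numbers.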
constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -860,8 +920,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -923,25 +982,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -986,16 +1055,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel 
channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1063,6 +1160,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1072,6 +1170,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1083,8 +1183,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // 
output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1112,7 +1214,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1128,7 +1229,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1142,11 +1243,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1158,6 +1262,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1166,9 +1271,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = 
allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1179,9 +1285,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1219,40 +1331,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1267,7 +1345,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1315,82 +1394,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1415,13 +1509,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1429,13 +1517,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index dd4aae8a06..9191598e88 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const 
fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h new file mode 100644 index 0000000000..89823b9d1d --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
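// [Editor's note - illustrative sketch] The event-by-event diagram choice added to the
// select_col_and_diag kernel and to the C++ sigmaKin above is an inverse-CDF draw over the
// per-diagram numerators: channels without an SDE config (channel2iconfig == -1) are
// skipped, the remaining numerators are normalised, and the first channel whose cumulative
// weight exceeds the random number is selected. Stripped of the SIMD/GPU indexing, and with
// a hypothetical helper name, the algorithm is:
unsigned int sampleChannel( const double* numerators,   // per-diagram numerators for one event
                            const int* channel2iconfig, // -1 marks channels without an SDE config
                            unsigned int nchannels,
                            double rnd )                // uniform random number in [0,1)
{
  double norm = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) norm += numerators[ichan];
  double cumsum = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    cumsum += numerators[ichan];
    if( rnd < cumsum / norm ) return ichan + 1; // channelId is 1-based, as in Fortran
  }
  return nchannels; // fallback, as in the kernels, if rounding leaves rnd >= cumsum/norm
}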
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H +#define MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + +
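// [Editor's note] The initialize() wrapper below funnels the one-time good-helicity setup
// through a function-local static: C++11 guarantees that such a static is initialized
// exactly once, even under concurrent calls ("magic statics"), which is why using
// initialize_impl() as the initializer of 'dummy' is thread-safe. A minimal standalone
// model of the pattern:
//
//   #include <cstdio>
//   void doOnce()
//   {
//     static const bool done = []() { std::printf( "runs exactly once\n" ); return true; }();
//     (void)done; // silence the unused-variable warning
//   }
//
// Every thread that reaches the static blocks until the first initialization completes.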
+  void initialize(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    // static local initialization is called exactly once in a thread-safe way
+    static void* dummy = initialize_impl( momenta, couplings, matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+                                          color_jamps,
+#endif
+                                          numerators,
+                                          denominators,
+                                          count );
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__
+#endif
+  void
+  transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride )
+  {
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    std::size_t i_page = i_event / page_size;
+    std::size_t i_vector = i_event % page_size;
+
+    for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part )
+    {
+      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )
+      {
+        momenta_out[i_page * CPPProcess::npar * 4 * page_size +
+                    i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event];
+      }
+    }
+  }
+
+#ifdef MGONGPUCPP_GPUIMPL
+
+  __global__ void copy_inputs(
+    const double* momenta_in,
+    const double* helicity_random_in,
+    const double* color_random_in,
+    const double* diagram_random_in,
+    const double* alpha_s_in,
+    fptype* momenta,
+    fptype* helicity_random,
+    fptype* color_random,
+    fptype* diagram_random,
+    fptype* g_s,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    transpose_momenta( &momenta_in[offset], momenta, i_event, stride );
+    diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5;
+    helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5;
+    color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5;
+    g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    // NB: the static_cast template arguments in this file were lost in this diff
+    // and have been reconstructed from the surrounding variable types
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+
+    // NB: the <<<...>>> launch configurations below were lost in this diff;
+    // <<<n_blocks, n_threads, 0, gpu_stream>>> is the natural reconstruction
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // need to round up to twice the SIMD page size (presumably because mixed precision
+    // processes two neppV pages per sigmaKin iteration, see #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    // NB: the HostBufferBase template arguments below were lost in this diff
+    // and have been reconstructed from how each buffer is used
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s corresponds to alpha_s = 0.118
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h> /* NB: the bracketed header name was lost in this diff; stddef.h (for size_t) is an educated guess */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a global metadata entry of the matrix element implementation
+   * (device type, particle count, diagram count, ...).
+   *
+   * @param meta_key
+   *    key of the metadata entry to query
+   * @param result
+   *    pointer to caller-allocated memory that receives the value; its type
+   *    depends on the key (UmamiDevice for UMAMI_META_DEVICE, int otherwise)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt
index faef5b2d67..11b54f703e 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt
+++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
 set zerowidth_tchannel F
 set auto_convert_model T
 save options auto_convert_model
-save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
+save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
 import model heft
 INFO: Restrict model heft with file models/heft/restrict_default.dat .
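(Taken together, the `umami.h` API above passes all event data across the C ABI as type-erased arrays selected by key, which is what lets minor versions add new inputs and outputs without breaking binary compatibility. A hypothetical CPU-side caller, sketched here for illustration only, with the param card path, particle count and momenta values as placeholders:

```cpp
#include "umami.h"

#include <cstdio>

int main()
{
  UmamiHandle handle;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // placeholder path

  int npar = 0;
  umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar ); // number of external particles

  // One event (count=1, stride=1, offset=0); momenta follow the
  // stride*(npar*i_mom+i_part)+i_event indexing used by transpose_momenta.
  double momenta[4 * 5] = { 0. }; // placeholder: fill with physical four-momenta before a real call
  double me2 = 0.;

  const UmamiInputKey in_keys[1] = { UMAMI_IN_MOMENTA };
  const void* const inputs[1] = { momenta };
  const UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* const outputs[1] = { &me2 };

  UmamiStatus status = umami_matrix_element( handle, 1, 1, 0, 1, in_keys, inputs, 1, out_keys, outputs );
  if( status == UMAMI_SUCCESS ) std::printf( "|M|^2 = %g\n", me2 );

  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}
```
)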
DEBUG: Simplifying conditional expressions  @@ -129,14 +130,14 @@ output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --ve Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -148,55 +149,57 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s -Wrote files for 12 helas calls in 0.062 s +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (4 diagrams) in 0.011 s +Wrote files for 12 helas calls in 2.377 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.193 s +ALOHA: aloha creates 4 routines in 0.164 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.178 s +ALOHA: aloha creates 8 routines in 0.125 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m2.118s -user 0m1.750s -sys 0m0.364s -Code generation completed in 2 seconds +real 0m9.223s +user 0m1.254s +sys 0m0.660s +Code generation completed in 9 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -217,9 +220,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -246,9 +249,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
 assert( useChannelIds == false );
 sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
 }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer ); // NB: the cast's template argument was lost in this diff; it follows from the return type
+  }
+
 // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
 // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
 static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h
index 7d7b960511..5e318bc0a4 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_heft.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
 typedef BufferBase<fptype> BufferNumerators;
 
 // The size (number of elements) per event in a memory buffer for numerators
- constexpr size_t sizePerEventNumerators = 1;
+ // (should be equal to the number of diagrams in the process)
+ constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc
index fbb0c2effb..c04f054aa9 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef
MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 2. 
* amp_sv[0]; @@ -404,8 +468,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -431,8 +501,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -746,9 +819,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -824,8 +896,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -887,25 +958,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -950,16 +1031,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1027,6 +1136,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1036,6 +1146,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1047,8 +1159,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1076,7 +1190,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1092,7 +1205,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1106,11 +1219,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1122,6 +1238,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1130,9 +1247,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1143,9 +1261,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1183,40 +1307,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1231,7 +1321,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1279,82 +1370,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1379,13 +1485,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1393,13 +1493,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h
index a4c60bf837..bfcb1209e9 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the selected channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the selected channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
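The new allrnddiagram input and allDiagramIdsOut output in these signatures implement an inverse-CDF choice of the single-diagram channel: the per-diagram numerators are first summed into a normalization, then their cumulative sum is scanned until it exceeds the event's random number. A standalone sketch of that selection for one event, with hypothetical plain arrays (the real code above additionally skips channels whose channel2iconfig entry is -1):

// Sketch only: pick a 1-based channelId from per-diagram weights and one
// uniform random number in [0,1), mirroring the selection logic in the patch.
unsigned int sampleChannel( const double* numerators, // [ndiagrams] weights for this event
                            int ndiagrams,
                            double rnd ) // uniform random number in [0,1)
{
  double normalization = 0.;
  for( int i = 0; i < ndiagrams; ++i ) normalization += numerators[i];
  if( normalization <= 0. ) return ndiagrams; // degenerate case: fall back to the last channel
  double cumulative = 0.;
  for( int i = 0; i < ndiagrams; ++i )
  {
    cumulative += numerators[i];
    if( rnd < cumulative / normalization ) return i + 1; // 1-based, like allDiagramIdsOut
  }
  return ndiagrams; // fallback for rnd ~ 1.0 (mirrors channelId = mgOnGpu::nchannels above)
}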
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h
new file mode 100644
index 0000000000..f7dbd383b0
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_HEFT_GG_BBX_H
+#define MG5_CONFIG_SIGMA_HEFT_GG_BBX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 4;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_HEFT_GG_BBX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))

 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o

 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc
new file mode 100644
index 0000000000..2b52267519
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.cc
@@ -0,0 +1,530 @@
+#include "umami.h"
+
+#include "CPPProcess.h"
+#include "GpuRuntime.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryBuffers.h"
+
+#include <cmath>
+
+#ifdef MGONGPUCPP_GPUIMPL
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
+namespace
+{
+
+  void* initialize_impl(
+    const fptype* momenta,
+    const fptype* couplings,
+    fptype* matrix_elements,
+#ifdef MGONGPUCPP_GPUIMPL
+    fptype* color_jamps,
+#endif
+    fptype* numerators,
+    fptype* denominators,
+    std::size_t count )
+  {
+    bool
is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
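The copy_inputs/copy_outputs kernels and transpose_momenta pin down the caller-facing memory layout: every per-event array is indexed as [field][stride] with a common event offset along the batch axis, while internally the momenta are repacked into the cudacpp AOSOA pages of neppM events. A sketch of the indexing a caller would use to fill the momenta buffer, assuming offset + count <= stride (hypothetical fill loop, double precision):

#include <cstddef>
#include <vector>

// Layout sketch matching transpose_momenta / copy_outputs:
//   momenta_in[stride * ( npar * i_mom + i_part ) + offset + i_event]
//   amp2_out[stride * i_diag + offset + i_event]
void fill_momenta( std::vector<double>& momenta_in, std::size_t stride, std::size_t offset,
                   std::size_t count, std::size_t npar )
{
  for( std::size_t i_event = 0; i_event < count; ++i_event )
    for( std::size_t i_part = 0; i_part < npar; ++i_part )
      for( std::size_t i_mom = 0; i_mom < 4; ++i_mom )          // E, px, py, pz
        momenta_in[stride * ( npar * i_mom + i_part ) + offset + i_event] = 0.; // fill with real momenta
}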
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_heft_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
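Every gpuMallocAsync above is paired with a gpuFreeAsync on the same stream at the end of the call. A hypothetical RAII guard (not part of this patch) built on the same GpuAbstraction.h macros shows how that pairing could be made automatic and exception-safe:

// Sketch only: stream-ordered buffer guard using the gpu* macros defined in GpuAbstraction.h.
template <typename T>
struct StreamBuffer
{
  T* ptr = nullptr;
  gpuStream_t stream;
  StreamBuffer( std::size_t n, gpuStream_t s ) : stream( s )
  {
    gpuMallocAsync( &ptr, n * sizeof( T ), stream ); // allocation is ordered on 'stream'
  }
  ~StreamBuffer() { gpuFreeAsync( ptr, stream ); }   // release is ordered on the same stream
  StreamBuffer( const StreamBuffer& ) = delete;
  StreamBuffer& operator=( const StreamBuffer& ) = delete;
};
// Usage: StreamBuffer<fptype> momenta( rounded_count * CPPProcess::npar * 4, gpu_stream );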
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
+#else // MGONGPUCPP_GPUIMPL
+    // round the event count up to a multiple of two SIMD pages (mixed-precision SIMD builds process two neppV pages per sigmaKin iteration, cf. #924)
+    std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM;
+    std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2;
+
+    HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 );
+    HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_heft_dependentCouplings::ndcoup * 2 );
+    HostBufferBase<fptype> g_s( rounded_count );
+    HostBufferBase<fptype> helicity_random( rounded_count );
+    HostBufferBase<fptype> color_random( rounded_count );
+    HostBufferBase<fptype> diagram_random( rounded_count );
+    HostBufferBase<fptype> matrix_elements( rounded_count );
+    HostBufferBase<unsigned int> diagram_index( rounded_count );
+    HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams );
+    HostBufferBase<fptype> denominators( rounded_count );
+    HostBufferBase<int> helicity_index( rounded_count );
+    HostBufferBase<int> color_index( rounded_count );
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride );
+      helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5;
+      color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5;
+      diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5;
+      g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
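End to end, umami.cc expects a caller to follow a fixed sequence: initialize once per thread, pass inputs and outputs as parallel key/pointer arrays, then free the handle. A minimal CPU-side sketch under those assumptions (hypothetical driver; momenta filled in the layout sketched earlier, stride equal to count and offset zero):

#include "umami.h"
#include <cstddef>
#include <vector>

int run_umami( const char* param_card_path, std::size_t count )
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, param_card_path ) != UMAMI_SUCCESS ) return 1;

  std::vector<double> momenta( count * 4 * 4 ); // npar=4 particles x 4 components, stride=count
  std::vector<double> m2( count );              // one |M|^2 per event
  UmamiInputKey in_keys[1] = { UMAMI_IN_MOMENTA };
  const void* inputs[1] = { momenta.data() };
  UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outputs[1] = { m2.data() };

  UmamiStatus status = umami_matrix_element( handle, count, count /* stride */, 0 /* offset */,
                                             1, in_keys, inputs, 1, out_keys, outputs );
  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}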
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                            _
+ *                           (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves global metadata about the matrix element implementation, such as the
+   * device it was built for or the number of particles, diagrams and helicity
+   * combinations.
+   *
+   * @param meta_key
+   *     key identifying the piece of metadata to retrieve
+   * @param result
+   *     pointer to the location the result is written to; its type depends on the
+   *     meta key (UmamiDevice* for UMAMI_META_DEVICE, int* for the various counts)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *     pointer to an instance of the subprocess. Has to be cleaned up by
+   *     the caller with `umami_free`.
+   * @param param_card_path
+   *     path to the parameter file
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     real part of the parameter value
+   * @param parameter_imag
+   *     imaginary part of the parameter value. Ignored for real valued parameters.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *     pointer to double to return imaginary part of the parameter value. Ignored
+   *     for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param count
+   *     number of events to evaluate the matrix element for
+   * @param stride
+   *     stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *     offset of the event index
+   * @param input_count
+   *     number of inputs to the matrix element
+   * @param input_keys
+   *     pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *     pointer to an array of void pointers to the inputs. The type of the inputs
+   *     depends on the input key
+   * @param output_count
+   *     number of outputs to the matrix element
+   * @param output_keys
+   *     pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *     pointer to an array of void pointers to the outputs. The type of the outputs
+   *     depends on the output key. The caller is responsible for allocating memory for
+   *     the outputs.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
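Before allocating output arrays such as the per-diagram amp2 buffer, a generic caller can size them through the metadata interface. A short sketch, assuming the counts are written as plain int as in the implementation above:

#include "umami.h"
#include <cstdio>

void print_process_shape()
{
  UmamiDevice device;
  int npar = 0, ndiagrams = 0, nhel = 0;
  if( umami_get_meta( UMAMI_META_DEVICE, &device ) == UMAMI_SUCCESS )
    std::printf( "device key: %d\n", (int)device );
  umami_get_meta( UMAMI_META_PARTICLE_COUNT, &npar );     // e.g. 4 for g g -> b b~
  umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &ndiagrams ); // e.g. 4, cf. processConfig::ndiagrams
  umami_get_meta( UMAMI_META_HELICITY_COUNT, &nhel );
  std::printf( "npar=%d ndiagrams=%d nhel=%d\n", npar, ndiagrams, nhel );
}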
diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt
index 5208ed190c..13878ae8fd 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt
+++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,26 +49,26 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
 set zerowidth_tchannel F
 set auto_convert_model T
 save options auto_convert_model
-save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
+save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
 import model heft
-INFO: download model from http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models
---2025-10-22 11:47:55--  http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz
-Resolving madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)...
130.104.1.243 -Connecting to madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)|130.104.1.243|:80... connected. +INFO: download model from https://madgraph.mi.infn.it/Downloads/models/heft.tgz to the following directory: /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/models  +--2025-12-11 12:36:44-- https://madgraph.mi.infn.it/Downloads/models/heft.tgz +Resolving madgraph.mi.infn.it (madgraph.mi.infn.it)... 192.135.21.75 +Connecting to madgraph.mi.infn.it (madgraph.mi.infn.it)|192.135.21.75|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 50876 (50K) [application/x-gzip] Saving to: ‘tmp.tgz’ - 0K .......... .......... .......... .......... ......... 100% 921K=0.05s + 0K .......... .......... .......... .......... ......... 100% 3.00M=0.02s -2025-10-22 11:47:55 (921 KB/s) - ‘tmp.tgz’ saved [50876/50876] +2025-12-11 12:36:44 (3.00 MB/s) - ‘tmp.tgz’ saved [50876/50876] heft/ heft/write_param_card.py @@ -104,7 +105,7 @@ INFO: load particles INFO: load vertices WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.004904985427856445  +DEBUG: model prefixing takes 0.0019080638885498047  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -170,49 +171,49 @@ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ generate g g > b b~ HIW<=1 INFO: Trying process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Process has 4 diagrams -1 processes with 4 diagrams generated in 0.004 s +1 processes with 4 diagrams generated in 0.006 s Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 
'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. -Generated helas calls for 1 subprocesses (4 diagrams) in 0.007 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files 
CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +Generated helas calls for 1 subprocesses (4 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.185 s +ALOHA: aloha creates 4 routines in 0.134 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
quit -real 0m0.821s -user 0m0.568s -sys 0m0.084s -Code generation completed in 1 seconds +real 0m2.583s +user 0m0.606s +sys 0m0.182s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h
index 7d7b960511..5e318bc0a4 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_heft.h"
+#include "processConfig.h"

 #include <memory>

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc
index 8fc4cf7184..e97d656ef0 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"

 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned
int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -742,9 +803,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; 
// disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -820,8 +880,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -883,25 +942,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += hAllDenominators[ievt]; // keep the denominator running sum over helicities (as in the pre-existing code) + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -946,16 +1015,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1023,6 +1120,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1032,6 +1130,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1043,8 +1143,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1072,7 +1174,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1088,7 +1189,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1102,11 +1203,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1118,6 +1222,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1126,9 +1231,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = 
ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1139,9 +1245,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1179,40 +1291,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1227,7 +1305,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1275,82 +1354,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1375,13 +1469,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1389,13 +1477,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h index a4c60bf837..bfcb1209e9 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h new file mode 100644 index 0000000000..f7dbd383b0 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
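The new processConfig.h just below defines the compile-time diagram count that sizes every per-diagram numerator buffer in this patch. As a standalone sketch (not part of the patch; all names except ndiagrams are hypothetical), the two indexing conventions used above and the inverse-CDF diagram sampling that consumes them look like this:

// Standalone sketch (not part of the patch): indexing of the per-diagram numerator
// buffers introduced here, and the diagram (channel) sampling built on top of them.
#include <cassert>
#include <cstddef>
namespace sketch
{
  constexpr int ndiagrams = 4; // processConfig::ndiagrams for gg -> bb~ in HEFT

  // GPU layout: the ndiagrams numerators of one event are contiguous,
  // allNumerators[ievt * ndiagrams + idiag] (cf. calculate_jamps above).
  inline std::size_t gpuIndex( std::size_t ievt, int idiag )
  {
    assert( idiag < ndiagrams );
    return ievt * ndiagrams + idiag;
  }

  // SIMD C++ layout: pages of neppV events with the diagram index in the middle,
  // i.e. [ipage][idiag][lane] (cf. the ievt / neppV * neppV * ndiagrams + ichan * neppV
  // + ieppV % neppV expression in sigmaKin).
  inline std::size_t simdIndex( std::size_t ievt, int idiag, std::size_t neppV )
  {
    return ( ievt / neppV ) * neppV * ndiagrams + idiag * neppV + ( ievt % neppV );
  }

  // Diagram choice as in select_col_and_diag: inverse-CDF sampling on the numerators
  // (the patch additionally skips channels with mgOnGpu::channel2iconfig[ichan] == -1).
  inline unsigned int sampleChannel( const double* numerators, double rnd )
  {
    double norm = 0.;
    for( int i = 0; i < ndiagrams; ++i ) norm += numerators[i];
    double cumsum = 0.;
    for( int i = 0; i < ndiagrams; ++i )
    {
      cumsum += numerators[i];
      if( rnd < cumsum / norm ) return i + 1; // channelIds are 1-based (Fortran convention)
    }
    return ndiagrams; // fallback for rnd ~ 1 under rounding, as in the patch
  }
}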
+ + +#ifndef MG5_CONFIG_SIGMA_HEFT_GG_BBX_H +#define MG5_CONFIG_SIGMA_HEFT_GG_BBX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 4; + +} + +#endif // MG5_CONFIG_SIGMA_HEFT_GG_BBX_H \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( 
is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
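+ // Dispatch each caller-provided (key, pointer) pair onto the matching typed input pointer; keys this implementation does not support are rejected with UMAMI_ERROR_UNSUPPORTED_INPUT.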
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_heft_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round up to twice the SIMD page size (the mixed-precision SIMD path processes two neppV pages per iteration) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_heft_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata about the matrix element code (device, particle count, + * diagram count, ...). + * + * @param meta_key + * key identifying the metadata entry to query + * @param result + * pointer to caller-allocated memory receiving the value; its type depends + * on the key + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index b5ca9e6bb6..7f8baeac9e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004863262176513672  +DEBUG: model prefixing takes 0.0018579959869384766  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . 
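A minimal usage sketch of the UMAMI C API declared in umami.h above (not part of the patch): the event count, the alpha_s value, the param_card path, the momenta layout, and the assumption that UMAMI_META_PARTICLE_COUNT is returned as an int are all illustrative.

    // Hedged usage sketch of the umami.h API above; values and layout are assumptions.
    #include "umami.h"
    #include <cstdio>
    #include <vector>

    int main()
    {
      UmamiHandle handle = nullptr;
      if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // hypothetical path

      int nparticles = 0; // assumption: this meta value is returned as an int
      umami_get_meta( UMAMI_META_PARTICLE_COUNT, &nparticles );

      const size_t count = 16, stride = count, offset = 0;    // one contiguous batch of 16 events
      std::vector<double> momenta( 4 * nparticles * stride ); // E, px, py, pz per particle; fill with phase-space points
      std::vector<double> alpha_s( stride, 0.118 );           // strong coupling per event
      std::vector<double> m2( stride );                       // |M|^2 per event, filled by the call

      UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
      const void* inputs[] = { momenta.data(), alpha_s.data() };
      UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
      void* outputs[] = { m2.data() };

      if( umami_matrix_element( handle, count, stride, offset, 2, in_keys, inputs, 1, out_keys, outputs ) == UMAMI_SUCCESS )
        std::printf( "|M|^2[0] = %g\n", m2[0] );

      umami_free( handle ); // every umami_initialize needs a matching umami_free
      return 0;
    }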
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +181,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.093 s +4 processes with 8 diagrams generated in 0.100 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. @@ -222,21 +223,21 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.520 s +12 processes with 144 diagrams generated in 0.282 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -270,9 +271,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 
[model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -281,9 +282,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -292,9 +293,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -303,9 +304,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -314,9 +315,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -325,9 +326,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1589]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -336,9 +337,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -347,21 +348,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1577]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.172 s -Wrote files for 212 helas calls in 0.856 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1589]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.122 s +Wrote files for 212 helas calls in 17.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.166 s +ALOHA: aloha creates 3 routines in 0.141 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.150 s +ALOHA: aloha creates 6 routines in 0.115 s FFV1 FFV1 FFV1 @@ -369,32 +370,34 @@ ALOHA: aloha creates 6 routines in 0.150 s FFV2 FFV2 VVV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m4.809s -user 0m4.082s -sys 0m0.695s -Code generation completed in 5 seconds +real 0m25.842s +user 0m2.903s +sys 0m1.311s +Code generation completed in 26 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -415,9 +418,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -444,9 +447,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define 
GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h index 71a4c3f155..058c89b2f9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm_no_b_mass.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index 0893180611..96144a4b95 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned 
int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -395,8 +456,11 @@ // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv 
+= cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -409,8 +473,11 @@ // Amplitude(s) for diagram number 2 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -759,9 +826,8 @@ gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false /* storeChannelWeights */, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -837,8 +903,7 @@ cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false /* storeChannelWeights */, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -900,25 +965,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -963,16 +1038,44 @@ #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1040,6 +1143,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1049,6 +1153,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1060,8 +1166,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1089,7 +1197,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1105,7 +1212,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1119,11 +1226,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1135,6 +1245,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1143,9 +1254,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1156,9 +1268,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1196,40 +1314,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1244,7 +1328,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1292,82 +1377,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1392,13 +1492,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1406,13 +1500,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h index b9c21cb625..8db125293c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h new file mode 100644 index 0000000000..4f350b6335 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/processConfig.h 
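Note on the buffer layout introduced above: the scalar per-event numerator becomes an array of processConfig::ndiagrams entries per event, grouped in SIMD pages of neppV lanes. A minimal standalone sketch of the implied indexing follows (numeratorIndex is an illustrative name, not part of the patch):

// Sketch only, assuming the [page][diagram][lane] layout used by the hunks above,
// i.e. allNumerators[ ievt/neppV * neppV*ndiagrams + ichan*neppV + ievt%neppV ]
inline int numeratorIndex( int ievt, int ichan, int neppV, int ndiagrams )
{
  const int ipagV = ievt / neppV; // SIMD page containing this event
  const int ieppV = ievt % neppV; // lane of this event within its page
  return ipagV * neppV * ndiagrams + ichan * neppV + ieppV;
}
// In the CUDA path (one event per thread, effectively neppV == 1) this reduces
// to allNumerators[ ievt * ndiagrams + ichan ], as used in calculate_jamps.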
@@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWM_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWM_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWM_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc index 2a56cf5ec4..801cc458c3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -395,8 +456,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. 
* amp_sv[0]; @@ -409,8 +473,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -759,9 +826,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -837,8 +903,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
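The normalise_output changes in the next hunk reduce the per-helicity numerator super-buffers into per-diagram totals before applying the single-diagram enhancement: once N[k] = sum over helicities of |A_k|^2 and the denominator D = sum over diagrams and helicities of |A_k|^2 are known, the weight applied to the ME for a 1-based channelId is N[channelId-1]/D. A hedged standalone sketch (channelWeight and the double stand-in for fptype are illustrative only):

using fptype = double; // stand-in for the plugin's configurable fptype
// Sketch of the reweighting step performed by normalise_output below:
// multiply |M|^2 by the selected diagram's share of the total.
inline fptype channelWeight( const fptype* N, fptype D, unsigned int channelId )
{
  return N[channelId - 1] / D; // channelId is 1-based (Fortran convention)
}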
#endif /* clang-format on */ @@ -900,25 +965,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -963,16 +1038,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1040,6 +1143,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1049,6 +1153,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1060,8 +1166,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1089,7 +1197,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1105,7 +1212,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1119,11 +1226,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1135,6 +1245,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1143,9 +1254,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1156,9 +1268,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1196,40 +1314,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1244,7 +1328,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1292,82 +1377,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1392,13 +1492,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1406,13 +1500,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h index 43024d30fd..88ccb8aebc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h new file mode 100644 index 0000000000..38d2d5ed20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/processConfig.h 
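For reference, the event-by-event channel sampling added in the hunks above draws a channel in proportion to its per-diagram numerator via a cumulative sum over the normalized totals. A simplified standalone sketch (sampleChannel is an illustrative name; the real loops also skip channels with mgOnGpu::channel2iconfig[ichan] == -1 and use the SIMD-page indexing shown earlier):

using fptype = double; // stand-in for the plugin's configurable fptype
unsigned int sampleChannel( const fptype* N, unsigned int nchannels, fptype rnd )
{
  fptype norm = 0.;
  for( unsigned int k = 0; k < nchannels; k++ ) norm += N[k]; // total numerator
  fptype cumul = 0.;
  for( unsigned int k = 0; k < nchannels; k++ )
  {
    cumul += N[k];
    if( rnd < cumul / norm ) return k + 1; // 1-based channelId
  }
  return nchannels; // rounding guard, mirroring the fallback in the code above
}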
@@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWP_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWP_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 2; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWP_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc index 6e87d2186e..733db97179 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -428,8 +495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -442,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -457,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -471,8 +547,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -485,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -499,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
@@ -513,8 +598,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 2. * amp_sv[0];
       jamp_sv[3] += 1. / 6. * amp_sv[0];
@@ -527,8 +615,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -541,8 +632,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -555,8 +649,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 2. * amp_sv[0];
       jamp_sv[3] += 1. / 6. * amp_sv[0];
@@ -955,9 +1052,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1033,8 +1129,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1096,25 +1191,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,   // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,    // if true, compute final multichannel weights
+    bool mulChannelWeight,       // if true, multiply matrix element by channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
-      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
-      totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      totAllDenominators[ievt] += hAllDenominators[ievt]; // keep the running sum of denominators over helicities
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
     }
+    if( mulChannelWeight )
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
+    }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
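For reference, the reduction performed by the new normalise_output body can be written out on plain arrays as below. This is a sketch under this patch's layout assumptions (numerators are [nGoodHel][nevt][ndiagrams] with the helicity #0 slice reused as the running total, denominators are [nGoodHel][nevt]; reduceChannelWeights is an illustrative name, not part of the patch):

```cpp
void reduceChannelWeights( double* ghelNumerators,         // [nGoodHel][nevt][ndiagrams]
                           double* ghelDenominators,       // [nGoodHel][nevt]
                           double* MEs,                    // [nevt], already divided by globaldenom
                           const unsigned int* channelIds, // [nevt], 1-based diagram ids
                           int nGoodHel, int ndiagrams, int nevt, bool mulChannelWeight )
{
  for( int ievt = 0; ievt < nevt; ++ievt )
  {
    double* tot = ghelNumerators + ievt * ndiagrams; // helicity #0 slice becomes the total
    for( int ighel = 1; ighel < nGoodHel; ++ighel )
    {
      const double* h = ghelNumerators + ( ievt + ighel * nevt ) * ndiagrams;
      for( int idiag = 0; idiag < ndiagrams; ++idiag ) tot[idiag] += h[idiag];
      ghelDenominators[ievt] += ghelDenominators[ievt + ighel * nevt];
    }
    // The single-diagram enhancement weight is numerator(chosen diagram) / denominator(all diagrams)
    if( mulChannelWeight ) MEs[ievt] *= tot[channelIds[ievt] - 1] / ghelDenominators[ievt];
  }
}
```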
@@ -1159,16 +1264,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,          // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )         // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1236,6 +1369,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1245,6 +1379,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1256,8 +1392,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
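The channel choice in the select_col_and_diag kernel above is inverse-CDF sampling over the per-diagram numerators: a diagram is drawn with probability numerator[ichan] / sum(numerators), skipping channels with no SDE config. A scalar sketch of the same logic (sampleDiagram is an illustrative name; channel2iconfig == -1 marks unused channels, as in the kernel):

```cpp
unsigned int sampleDiagram( const double* numerators,   // [nchannels] for one event
                            const int* channel2iconfig, // -1 marks channels without an SDE config
                            unsigned int nchannels,
                            double rnd )                // random number in [0,1)
{
  double normalization = 0.;
  for( unsigned int i = 0; i < nchannels; ++i )
    if( channel2iconfig[i] != -1 ) normalization += numerators[i];
  double cumulative = 0.;
  for( unsigned int i = 0; i < nchannels; ++i )
  {
    if( channel2iconfig[i] == -1 ) continue;
    cumulative += numerators[i];
    if( rnd < cumulative / normalization ) return i + 1; // 1-based channelId
  }
  return nchannels; // fallback, as in the kernel above
}
```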
@@ -1285,7 +1423,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1301,7 +1438,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1315,11 +1452,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1331,6 +1471,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     //     In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1339,9 +1480,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
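The loop above issues one calculate_jamps launch per good helicity, each on its own stream and each writing a disjoint ighel slice of the super-buffers, so no synchronization is needed until the cross-helicity reduction. A minimal sketch of this pattern using the wrappers from GpuAbstraction.h (someKernel, superBuffer, sliceSize and streams are placeholders, not names from the patch):

```cpp
for( int ighel = 0; ighel < nGoodHel; ighel++ )
{
  fptype* slice = superBuffer + ighel * sliceSize; // disjoint per-stream output => no races
  gpuLaunchKernelStream( someKernel, gpublocks, gputhreads, streams[ighel], slice, nevt );
}
for( int ighel = 0; ighel < nGoodHel; ighel++ )
  gpuStreamSynchronize( streams[ighel] ); // join all helicity streams before reducing
```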
@@ -1352,9 +1494,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1392,40 +1540,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1440,7 +1554,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1488,82 +1603,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv targetamp2[ncolor] = { 0 };
-      for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-      for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams
+                                           + ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams
+                                           + ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
           //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1588,13 +1718,7 @@ namespace mg5amcCpu
       // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
       // [NB 'sum over final spins, average over initial spins', eg see
       // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1602,13 +1726,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
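Per event, the C++ tail of sigmaKin above reduces to the following scalar logic once the SIMD accessors are stripped away (a sketch; normaliseHost, helcolDenom and the flat buffers are stand-ins for the real E_ACCESS/NUM_ACCESS/DEN_ACCESS machinery):

```cpp
void normaliseHost( double* MEs, const double* numerators, const double* denominators,
                    const unsigned int* channelIds, double helcolDenom,
                    int ndiagrams, int nevt, bool mulChannelWeight )
{
  for( int ievt = 0; ievt < nevt; ++ievt )
  {
    MEs[ievt] /= helcolDenom; // average over helicities and colors
    if( mulChannelWeight && channelIds != nullptr ) // channelIds[ievt] is a 1-based diagram id
      MEs[ievt] *= numerators[ievt * ndiagrams + channelIds[ievt] - 1] / denominators[ievt];
  }
}
```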
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h
index b168a7dacf..c06c1088d2 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h
@@ -164,6 +164,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -172,6 +173,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -188,6 +191,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -195,6 +199,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h
new file mode 100644
index 0000000000..743c903011
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWMG_H
+#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWMG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 12;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_DUX_TTXWMG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
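The new per-process processConfig.h exposes the diagram count as a compile-time constant, which is what lets the numerator buffers in the hunks above and below be sized as nevt * ndiagrams without any runtime query. A sketch of an illustrative consumer (NumeratorBuffer is hypothetical, not part of the patch):

```cpp
#include "processConfig.h"
#include <array>

// Compile-time sizing from processConfig::ndiagrams: one slot per diagram per event.
template<int NEVT>
struct NumeratorBuffer
{
  std::array<double, NEVT * processConfig::ndiagrams> data{};
  double& at( int ievt, int idiag ) { return data[ievt * processConfig::ndiagrams + idiag]; }
};
```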
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc
index 563e3c6ead..c3009cc038 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+                                               )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -240,7 +304,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps,        // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights,
     fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s,    // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -249,7 +313,7 @@ namespace mg5amcCpu
 #else
     cxtype_sv* allJamp_sv,   // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights,
    fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv,     // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -345,7 +409,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -358,7 +423,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -367,12 +432,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -398,8 +459,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[2] += 1. / 6. * amp_sv[0];
@@ -412,8 +476,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[2] += 1. / 6. * amp_sv[0];
@@ -428,8 +495,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 3
       FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[1] += 1. / 6. * amp_sv[0];
@@ -442,8 +512,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[1] += 1. / 6. * amp_sv[0];
@@ -457,8 +530,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -471,8 +547,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -485,8 +564,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -499,8 +581,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 8
       FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[7] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -513,8 +598,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[2] += 1. / 6. * amp_sv[0];
@@ -527,8 +615,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -541,8 +632,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -555,8 +649,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -955,9 +1052,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1033,8 +1129,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1096,25 +1191,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,   // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,    // if true, compute final multichannel weights
+    bool mulChannelWeight,       // if true, multiply matrix element by channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
-      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
-      totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      totAllDenominators[ievt] += hAllDenominators[ievt]; // keep the running sum of denominators over helicities
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
    }
+    if( mulChannelWeight )
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
+    }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
@@ -1159,16 +1264,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,          // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )         // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1236,6 +1369,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1245,6 +1379,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1256,8 +1392,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1285,7 +1423,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1301,7 +1438,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1315,11 +1452,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1331,6 +1471,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     //     In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1339,9 +1480,10 @@ namespace mg5amcCpu
      const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1352,9 +1494,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
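
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; kernel and buffer names are hypothetical.) The
// one-stream-per-good-helicity pattern of PART 1a, reduced to its skeleton: independent
// per-helicity kernels run concurrently, each on its own stream, and a combining kernel runs
// after a synchronisation point, mirroring steps (1) and (3). The real code launches
// calculate_jamps via the gpuLaunchKernelStream macro instead.
// ------------------------------------------------------------------------------------------------
#include <cuda_runtime.h>
__global__ void computePerHelicity( float* perHelOut, int ihel, int nevt ) { /* per-helicity work */ }
__global__ void combineHelicities( const float* perHelIn, float* out, int nGoodHel, int nevt ) { /* sum over helicities */ }
void runHelicityStreams( float* perHel, float* out, cudaStream_t* streams, int nGoodHel, int nevt )
{
  for( int ighel = 0; ighel < nGoodHel; ++ighel ) // step (1): one stream per good helicity
    computePerHelicity<<<64, 256, 0, streams[ighel]>>>( perHel + ighel * nevt, ighel, nevt );
  for( int ighel = 0; ighel < nGoodHel; ++ighel ) // step (3): wait for all helicity streams
    cudaStreamSynchronize( streams[ighel] );
  combineHelicities<<<64, 256>>>( perHel, out, nGoodHel, nevt );
}
// ------------------------------------------------------------------------------------------------
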
@@ -1392,40 +1540,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1440,7 +1554,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1488,82 +1603,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
-        }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
-        {
-          const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
-          {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
-            {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
-              break;
-            }
-          }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
-          {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
-            {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
-              break;
-            }
-          }
-#endif
-        }
-      }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
+        }
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          const int ievt = ievt00 + ieppV;
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
+            {
+              channelIdVec[ieppV] = ichan + 1;
+              break;
+            }
+          }
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+            {
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              break;
+            }
+          }
+        }
+      }
       else
@@ -1588,13 +1718,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1602,13 +1726,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
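
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) The event-by-event
// channel choice added above is an inverse-CDF draw: each active diagram is selected with
// probability proportional to its single-diagram numerator. A scalar, self-contained model of
// the same loop (inactive channels, i.e. channel2iconfig[ichan] == -1, enter with weight 0):
// ------------------------------------------------------------------------------------------------
#include <cassert>
#include <vector>
// Returns a 1-based diagram id, as stored in channelIdVec[] / allDiagramIdsOut[] above
unsigned int sampleDiagram( const std::vector<double>& numerators, double rnd )
{
  double normalization = 0.;
  for( double n : numerators ) normalization += n;
  assert( normalization > 0. );
  double numerator_sum = 0.;
  for( unsigned int ichan = 0; ichan < numerators.size(); ++ichan )
  {
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization ) return ichan + 1;
  }
  return static_cast<unsigned int>( numerators.size() ); // fallback for rnd ~ 1 (cf. 'channelIdVec[ieppV] = mgOnGpu::nchannels' above)
}
// ------------------------------------------------------------------------------------------------
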
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h
index daa474c26a..649bf473b3 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h
@@ -164,6 +164,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol, // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
@@ -172,6 +173,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -188,6 +191,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol, // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+            const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
@@ -195,6 +199,8 @@ namespace mg5amcCpu
             int* allselcol, // output: helicity selection[nevt]
             fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
             fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
             const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h
new file mode 100644
index 0000000000..0861e7eec7
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GD_TTXWMU_H
+#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GD_TTXWMU_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 12;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GD_TTXWMU_H
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
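
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; the helper is hypothetical.) The new
// processConfig.h exposes the per-subprocess diagram count as a compile-time constant, so every
// per-diagram buffer can be sized consistently from one place, e.g.:
// ------------------------------------------------------------------------------------------------
#include "processConfig.h"
#include <cstddef>
#include <vector>
// One numerator slot per good helicity, per event, per diagram (zero-initialised), matching
// the cNGoodHel * processConfig::ndiagrams * nevt memsets in CPPProcess.cc
std::vector<double> makeNumeratorBuffer( int nGoodHel, int nevt )
{
  return std::vector<double>( static_cast<std::size_t>( nGoodHel ) * nevt * processConfig::ndiagrams, 0. );
}
// ------------------------------------------------------------------------------------------------
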
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc
index d6a510d40d..bb7db7b6ef 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
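
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) getChannelId relies on
// the invariant that all events in one SIMD page carry the same channelId, so the page can be
// represented by a single scalar. The same check, reduced to plain C++ over a raw array:
// ------------------------------------------------------------------------------------------------
#include <cassert>
constexpr int neppVEx = 4; // stand-in for neppV (events per SIMD page)
unsigned int pageChannelId( const unsigned int* allChannelIds, int ievt00 )
{
  const unsigned int channelId = allChannelIds[ievt00]; // element[0] of the page
  for( int i = 1; i < neppVEx; ++i )
    assert( channelId == allChannelIds[ievt00 + i] ); // cf. SANITY CHECK #898 above
  assert( channelId > 0 ); // channelId == 0 would mean "multichannel disabled"
  return channelId;
}
// ------------------------------------------------------------------------------------------------
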
@@ -240,7 +304,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                    fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   bool storeChannelWeights, // input: if true, store the per-diagram channel weights (numerators and denominators)
                    fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -249,7 +313,7 @@ namespace mg5amcCpu
 #else
                    cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                   bool storeChannelWeights, // input: if true, store the per-diagram channel weights (numerators and denominators)
                    fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel
                    fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
                    fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -345,7 +409,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -358,7 +423,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -367,12 +432,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
 
@@ -398,8 +459,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 1
     FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[0] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -412,8 +476,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[1] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -428,8 +495,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[2] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -442,8 +512,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 4
     FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[3] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[2] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -457,8 +530,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 5
     FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[4] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[2] -= 1. / 6. * amp_sv[0];
@@ -471,8 +547,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 6
     FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[5] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[2] -= 1. / 6. * amp_sv[0];
@@ -485,8 +564,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 7
     FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[6] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
@@ -499,8 +581,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 8
     FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[7] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
@@ -513,8 +598,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 9
     FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[8] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
     jamp_sv[3] += 1. / 2. * amp_sv[0];
@@ -527,8 +615,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 10
     FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[9] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -541,8 +632,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 11
     FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[10] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -555,8 +649,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 12
     FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[11] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += 1. / 2. * amp_sv[0];
     jamp_sv[1] -= 1. / 6. * amp_sv[0];
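
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) The per-diagram change
// above replaces 'if( channelId == N ) numerators_sv += ...' with an unconditional store into
// slot N-1, so a single pass fills the weights of every channel at once. A scalar model of the
// same pattern (std::norm plays the role of cxabs2):
// ------------------------------------------------------------------------------------------------
#include <complex>
void accumulateDiagram( int idiag, // 0-based diagram index
                        const std::complex<double>& amp, // amplitude of this diagram for this event
                        bool storeChannelWeights,
                        double* numerators, // [ndiagrams] slots for this event, summed over helicities
                        double& denominator ) // one slot per event, shared by all diagrams
{
  if( storeChannelWeights )
  {
    const double a2 = std::norm( amp ); // |amp|^2
    numerators[idiag] += a2;
    denominator += a2;
  }
}
// ------------------------------------------------------------------------------------------------
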
@@ -955,9 +1052,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1033,8 +1129,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1096,25 +1191,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights, // if true, compute final multichannel weights
+                    bool mulChannelWeight, // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
       }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
+      }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
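
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) After normalise_output
// has summed the per-diagram numerators of all good helicities into the "helicity #0" slots,
// the multichannel reweighting is just a ratio taken at the chosen channel:
// ------------------------------------------------------------------------------------------------
// ME' = ME * N[channelId-1] / D, i.e. the single-diagram-enhancement weight of this event
double reweightME( double me, // |M|^2 already divided by globaldenom
                   const double* numerators, // [ndiagrams] for this event, summed over helicities
                   double denominator, // for this event, summed over helicities and diagrams
                   unsigned int channelId ) // 1-based diagram id, e.g. from allChannelIds[ievt]
{
  return me * numerators[channelId - 1] / denominator;
}
// ------------------------------------------------------------------------------------------------
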
@@ -1159,16 +1264,44 @@ namespace mg5amcCpu
   __global__ void
-  select_col( int* allselcol, // output: color selection[nevt]
-              const fptype* allrndcol, // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol, // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut, // output: sampled diagram ids
+                       const fptype* allrndcol, // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators, // input: all numerators
+                       const fptype* allDenominators, // input: all denominators
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1236,6 +1369,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol, // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
@@ -1245,6 +1379,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
             fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1256,8 +1392,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol, // output: helicity selection[nevt]
-            fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities
+            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight, // if true, multiply the ME output by the channel weight
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
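
// ------------------------------------------------------------------------------------------------
// (Illustrative sketch, not part of the patch; names are hypothetical.) The color choice in
// select_col_and_diag is another inverse-CDF draw, but over the squared color amplitudes and
// restricted to the colors allowed for the channel's SDE config (the icolamp mask). Reduced to
// a scalar helper, with 'allowed' standing for mgOnGpu::icolamp[iconfig - 1]:
// ------------------------------------------------------------------------------------------------
#include <cassert>
int selectColor( const bool* allowed, // [ncolor] mask of colors allowed for this config
                 const double* jamp2, // [ncolor] squared partial color amplitudes
                 int ncolor,
                 double rndcol ) // uniform random number in [0,1)
{
  double total = 0.;
  for( int icolC = 0; icolC < ncolor; ++icolC )
    if( allowed[icolC] ) total += jamp2[icolC];
  assert( total > 0. );
  double cumulative = 0.;
  for( int icolC = 0; icolC < ncolor; ++icolC )
  {
    if( allowed[icolC] ) cumulative += jamp2[icolC];
    if( rndcol < cumulative / total ) return icolC + 1; // 1-based, as in the Fortran convention
  }
  return ncolor; // numerical safety net for rndcol ~ 1
}
// ------------------------------------------------------------------------------------------------
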
@@ -1285,7 +1423,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1301,7 +1438,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1315,11 +1452,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1331,6 +1471,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1339,9 +1480,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1352,9 +1494,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1392,40 +1540,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1440,7 +1554,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1488,82 +1603,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1588,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1602,13 +1726,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h index d4d5408ad2..5330725977 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h new file mode 100644 index 0000000000..2e039b079e --- /dev/null +++ 
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GDX_TTXWPUX_H
+#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GDX_TTXWPUX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 12;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GDX_TTXWPUX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc
index 75dc9427fe..d69812222a 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId!
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -428,8 +495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -442,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -457,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -471,8 +547,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -485,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -499,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. 
* amp_sv[0]; @@ -513,8 +598,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. * amp_sv[0]; @@ -527,8 +615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +632,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -955,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1033,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
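During helicity filtering the multichannel weights are not needed, so the replacement call just below passes storeChannelWeights = false. In the main sigmaKin path, by contrast, calculate_jamps now accumulates each diagram's |amp|^2 into its own slot of a per-event numerator record (numerators_sv[idiag], with buffers sized nevt * processConfig::ndiagrams), as in the amplitude blocks above. The scalar indexing implied by that SIMD layout is easy to get wrong, so here is a standalone C++ sketch of it; the neppV value and the double-precision fptype are build-dependent assumptions of the sketch, not part of the patch:

#include <cassert>
#include <cstddef>
#include <vector>

constexpr int neppV = 4;      // events per SIMD page (illustrative value; fixed by the build)
constexpr int ndiagrams = 12; // as in processConfig.h for these P1 subprocesses

// Scalar index of the numerator of diagram idiag (0-based) for event ievt:
// each SIMD page of neppV events owns ndiagrams contiguous vectors of neppV lanes,
// matching allNumerators[ievt / neppV * neppV * ndiagrams + ichan * neppV + ieppV % neppV] in the patch
std::size_t numIndex( int ievt, int idiag )
{
  assert( idiag >= 0 && idiag < ndiagrams );
  const std::size_t pageBase = static_cast<std::size_t>( ievt / neppV ) * neppV * ndiagrams;
  return pageBase + static_cast<std::size_t>( idiag ) * neppV + ievt % neppV;
}

int main()
{
  const int nevt = 16;
  std::vector<double> allNumerators( static_cast<std::size_t>( nevt ) * ndiagrams, 0. );
  allNumerators[numIndex( 5, 3 )] += 1.; // |amp|^2 of diagram 4 for event 5 accumulates here
  return 0;
}

On the CUDA side the record is simply per thread, i.e. allNumerators[ievt * processConfig::ndiagrams + idiag], as in the kernel code above.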
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1096,25 +1191,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply the matrix element by the channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
 {
   const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
   allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   const int nevt = gridDim.x * blockDim.x;
-  if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+  if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
   {
     fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
     fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
     for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
     {
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      totAllNumerators[ievt] += hAllNumerators[ievt];
       totAllDenominators[ievt] += hAllDenominators[ievt];
+      fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+      fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+      for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+      {
+        firstNumerator[idiag] += hAllNumerators[idiag];
+      }
+    }
+    if( mulChannelWeight )
+    {
+      unsigned int channelId = allChannelIds[ievt];
+      allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
     }
-    allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
   }
 #endif
   return;
@@ -1159,16 +1264,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1236,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1245,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1256,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1285,7 +1423,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1301,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1315,11 +1452,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1331,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1339,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1352,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1392,40 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1440,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1488,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
            {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1588,13 +1718,7 @@ namespace mg5amcCpu
       // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
       // [NB 'sum over final spins, average over initial spins', eg see
      // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -1602,13 +1726,14 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators,
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h index 6aaf443f35..362cd39944 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h new file mode 100644 index 0000000000..8aa1915d04 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/processConfig.h 
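The sixteen-line processConfig.h that follows is the same boilerplate generated for each P1 subprocess directory: it exposes the diagram count as a compile-time constant, and every per-diagram buffer in this patch is sized with it. A minimal sketch of the intended consumption pattern (std::vector stands in for the real host/device buffer classes, which are an assumption of the sketch, not shown in the patch):

#include <cstddef>
#include <vector>

namespace processConfig
{
  constexpr int ndiagrams = 12; // one such constant per subprocess, from its processConfig.h
}

// Numerator buffers now hold one running sum per diagram per event,
// so a buffer that used to be sized nevt becomes nevt * ndiagrams:
std::vector<double> makeNumeratorBuffer( int nevt )
{
  return std::vector<double>( static_cast<std::size_t>( nevt ) * processConfig::ndiagrams, 0. );
}

Keeping the constant in a tiny standalone header lets files such as umami.cc share it across subprocesses without pulling in the full CPPProcess.h.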
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GU_TTXWPD_H
+#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GU_TTXWPD_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 12;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GU_TTXWPD_H
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc
index 27c3656e02..c9b72712e9 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,6 +100,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId!
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -428,8 +495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -442,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -457,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -471,8 +547,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -485,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -499,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. 
* amp_sv[0]; @@ -513,8 +598,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -527,8 +615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +632,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -955,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1033,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
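Here too, the helicity-filtering call just below passes storeChannelWeights = false, since no channel weights are needed while filtering. Once the per-diagram numerators have been summed over helicities, the event-by-event diagram choice (in select_col_and_diag on the GPU and in the sigmaKin page loop on the CPU) is an inverse-CDF draw over those numerators, skipping diagrams without an SDE config. A standalone sketch of that draw, with a hypothetical channel2iconfig mapping standing in for mgOnGpu::channel2iconfig:

#include <cassert>
#include <cstdio>

constexpr int nchannels = 12; // equals the diagram count for these P1 subprocesses

// Draw a 1-based channelId with probability numerators[ichan] / (sum over valid channels),
// mirroring the sampling loops added in this patch
unsigned int sampleDiagram( const double* numerators, const int* channel2iconfig, double rnd )
{
  double normalization = 0.;
  for( int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  assert( normalization > 0. );
  unsigned int channelId = nchannels; // fallback to the last channel, as in the code above
  double cumulative = 0.;
  for( int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue; // no SDE config: never sampled
    cumulative += numerators[ichan];
    if( rnd < cumulative / normalization )
    {
      channelId = ichan + 1; // channelId uses Fortran-style 1-based numbering
      break;
    }
  }
  return channelId;
}

int main()
{
  const int channel2iconfig[nchannels] = { 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1 }; // hypothetical mapping
  double numerators[nchannels];
  for( int i = 0; i < nchannels; i++ ) numerators[i] = 1. + i; // dummy per-diagram |amp|^2 sums
  std::printf( "sampled channelId = %u\n", sampleDiagram( numerators, channel2iconfig, 0.3 ) );
  return 0;
}

The sampled id is written to allDiagramIdsOut[ievt] and then drives the same per-channel color choice that a caller-supplied channelId would.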
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1096,25 +1191,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // compute the totals over all good helicities { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1159,16 +1264,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice
(nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[ndiagrams*nevt], summed over helicities + const fptype* allDenominators, // input: denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; // fallback if no channel is selected below + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1236,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1245,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1256,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[ndiagrams*nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1285,7 +1423,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS =
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1301,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1315,11 +1452,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1331,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1339,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1352,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1392,40 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1440,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1488,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1588,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1602,13 +1726,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h index a4f9928134..dd4e039f85 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/processConfig.h new file mode 100644 index 0000000000..6724b700f9 --- /dev/null +++
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_GUX_TTXWMDX_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_GUX_TTXWMDX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_GUX_TTXWMDX_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index 31f01d963a..7ad6ebf5cb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,6 +100,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -240,7 +304,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -249,7 +313,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -345,7 +409,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -358,7 +423,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -367,12 +432,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -398,8 +459,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -428,8 +495,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -442,8 +512,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -457,8 +530,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -471,8 +547,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -485,8 +564,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -499,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -513,8 +598,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -527,8 +615,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -541,8 +632,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -555,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -955,9 +1052,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1033,8 +1129,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
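The per-diagram numerators stored via the storeChannelWeights flag feed the new event-by-event choice of channel: both select_col_and_diag on the GPU and the allrnddiagram loop in the C++ path walk the cumulative distribution defined by the numerators, skipping channels whose channel2iconfig entry is -1. A minimal host-side sketch of that inverse-CDF walk, assuming double precision and a hypothetical free function sampleChannel that is not part of the plugin:

#include <vector>

// Draw a 1-based channelId from the discrete distribution given by per-channel weights,
// mirroring the sampling loop in select_col_and_diag: channels without an SDE iconfig
// (channel2iconfig == -1) carry no weight and can never be selected.
unsigned int sampleChannel( const std::vector<double>& numerators, const std::vector<int>& channel2iconfig, double rnd )
{
  const unsigned int nchannels = static_cast<unsigned int>( numerators.size() );
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  double cumulative = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    cumulative += numerators[ichan];
    if( rnd < cumulative / normalization ) return ichan + 1; // 1-based, as in the patch
  }
  return nchannels; // fallback when rounding keeps the cumulative ratio below rnd
}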
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1096,25 +1191,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // compute the totals over all good helicities { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1159,16 +1264,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice
(nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[ndiagrams*nevt], summed over helicities + const fptype* allDenominators, // input: denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; // fallback if no channel is selected below + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1236,6 +1369,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1245,6 +1379,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1256,8 +1392,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[ndiagrams*nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1285,7 +1423,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS =
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1301,7 +1438,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1315,11 +1452,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1331,6 +1471,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1339,9 +1480,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1352,9 +1494,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1392,40 +1540,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1440,7 +1554,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1488,82 +1603,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1588,13 +1718,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1602,13 +1726,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators,
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h index 7d990b3bbd..2052fbd364 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h @@ -164,6 +164,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -172,6 +173,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the channel weight into the ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -188,6 +191,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -195,6 +199,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the channel weight into the ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h new file mode 100644 index 0000000000..59ab03987d --- /dev/null +++
b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWPG_H +#define MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWPG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 12; + +} + +#endif // MG5_CONFIG_SIGMA_SM_NO_B_MASS_UDX_TTXWPG_H \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> // std::size_t + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype*
matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
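+ // NB: the type behind each void* input is fixed by its key (cf. the umami.h documentation); keys this process cannot consume are rejected with UMAMI_ERROR_UNSUPPORTED_INPUT rather than silently ignored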
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to twice the SIMD page size (mixed-precision builds process two neppV pages at a time) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + }
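+ // NB: when an optional input is not supplied, fixed defaults are used: 0.5 for the helicity/color/diagram random numbers and g_s = sqrt( 4 * M_PI * 0.118 ) = 1.2177157847767195, i.e. alpha_s = 0.118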
+ computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> /* size_t */ + +#ifdef __cplusplus extern "C" { #endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata of the matrix element code, such as the target device + * and the number of particles, diagrams and helicity combinations. + * + * @param meta_key + * key of the requested metadata entry + * @param result + * pointer to the result; its type depends on the meta key + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
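+ * + * Memory layout (as implemented in umami.cc): batched arrays use a structure-of-arrays layout with batch stride `stride`, i.e. slot q of event i lives at array[stride * q + offset + i]. UMAMI_IN_MOMENTA uses 4 * particle_count slots with q = particle_count * i_mom + i_part, UMAMI_OUT_DIAGRAM_AMP2 uses one slot per diagram (q = i_diag), and per-event scalars such as UMAMI_OUT_MATRIX_ELEMENT use a single slot (q = 0).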
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 0da34a0aa2..c814985da4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0046498775482177734  +DEBUG: model prefixing takes 0.0018219947814941406  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -167,7 +168,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.025 s +5 processes with 7 diagrams generated in 0.057 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -207,7 +208,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.114 s +13 processes with 76 diagrams generated in 0.136 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -373,21 +374,21 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.872 s +65 processes with 1119 diagrams generated in 0.886 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -498,9 +499,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 
INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 
82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1589]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -509,9 +510,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -520,9 +521,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -531,9 +532,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -542,9 +543,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -553,9 +554,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -564,9 +565,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -575,9 +576,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -586,9 +587,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -597,9 +598,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -608,9 +609,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -619,9 +620,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -630,9 +631,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -641,9 +642,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -652,9 +653,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -663,9 +664,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1589]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -674,9 +675,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
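The identity tables in the DEBUG lines above are the simplest case: every diagram maps one-to-one onto an SDE iconfig. Elsewhere in this patch the same mapping is consumed through mgOnGpu::channel2iconfig, where the input index uses C indexing (channelId - 1), the output uses Fortran indexing (iconfig), and a -1 entry marks a diagram with no associated config (cf. #877/#917 in the hunks below). A minimal standalone sketch of that lookup, with hypothetical values standing in for the generated table:

#include <cassert>

namespace sketch
{
  // Hypothetical stand-in for the generated mgOnGpu::channel2iconfig table:
  // for gg_ttx all 3 diagrams map 1:1 onto SDE configs (cf. the DEBUG lines above);
  // a -1 entry would mark a diagram with no associated SDE config.
  constexpr int nchannels = 3;
  constexpr int channel2iconfig[nchannels] = { 1, 2, 3 };
}

int main()
{
  const unsigned int channelId = 2;                           // diagram id, 1-based (Fortran-style)
  const int iconfig = sketch::channel2iconfig[channelId - 1]; // SDE config, 1-based; -1 if none
  assert( iconfig > 0 && iconfig <= sketch::nchannels );      // cf. SANITY CHECK #917 in the patch
  return 0;
}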
@@ -685,25 +686,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1577]  -Generated helas calls for 18 subprocesses (372 diagrams) in 1.392 s -Wrote files for 810 helas calls in 2.303 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1589]  +Generated helas calls for 18 subprocesses (372 diagrams) in 0.701 s +Wrote files for 810 helas calls in 44.199 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.191 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.237 s +ALOHA: aloha creates 10 routines in 0.178 s VVV1 VVV1 FFV1 @@ -716,32 +717,34 @@ ALOHA: aloha creates 10 routines in 0.237 s VVVV3 VVVV4 VVVV4 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m10.952s -user 0m9.707s -sys 0m1.156s -Code generation completed in 11 seconds +real 0m57.768s +user 0m6.485s +sys 0m2.494s +Code generation completed in 58 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -762,9 +765,9 @@ Code generation completed in 11 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -791,9 +794,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS 
HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h index 2fa0ce29e0..9bc525d8ad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_sm.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 2ffa35504b..02f5d7a8eb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL 
+ , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +469,11 @@ 
namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -732,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -810,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
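// ------------------------------------------------------------------------
// Illustrative standalone sketch (hypothetical names; not part of the
// generated code): the hunks above replace the single per-event numerator,
// selected by channelId, with one slot per diagram,
// numerators[ievt * ndiagrams + idiag], guarded by the new
// storeChannelWeights flag; the denominator keeps the running sum of
// |amp|^2 over all diagrams, as before.
#include <complex>
#include <vector>

static void accumulateChannelWeights( const std::vector<std::complex<double>>& amps, // one amplitude per diagram, one helicity
                                      std::vector<double>& numerators,               // size nevt * ndiagrams
                                      std::vector<double>& denominators,             // size nevt
                                      int ievt, int ndiagrams, bool storeChannelWeights )
{
  if( !storeChannelWeights ) return; // e.g. during helicity filtering, where channel weights are not needed
  for( int idiag = 0; idiag < ndiagrams; ++idiag )
  {
    const double a2 = std::norm( amps[idiag] );  // plays the role of cxabs2( amp_sv[0] ) in the generated code
    numerators[ievt * ndiagrams + idiag] += a2;  // was: numerators_sv += ... only if channelId == idiag + 1
    denominators[ievt] += a2;                    // was: denominators_sv += ... only if channelId != 0
  }
}
// ------------------------------------------------------------------------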
#endif /* clang-format on */ @@ -873,25 +941,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -936,16 +1014,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1013,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1022,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1033,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1062,7 +1173,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1078,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1092,11 +1202,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1108,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1116,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1129,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,40 +1290,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1217,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1265,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1365,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1379,13 +1476,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 3100207a11..04a1595fd1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h new file mode 100644 index 0000000000..8f6a27f796 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 
2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index b1adf10a8d..791fdf32bd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators for this helicity fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators for this helicity fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2.
* amp_sv[0]; @@ -709,9 +773,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -787,8 +850,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -850,25 +912,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { +
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -913,16 +985,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -990,6 +1090,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -999,6 +1100,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1010,8 +1113,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1039,7 +1144,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1055,7 +1159,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1069,11 +1173,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1085,6 +1192,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1093,9
+1201,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1106,9 +1215,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1146,40 +1261,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
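Both the select_col_and_diag kernel launched above and the C++ loop further below sample a diagram with probability proportional to its accumulated numerator, via a single inverse-CDF draw (the real code additionally skips channels with mgOnGpu::channel2iconfig[ichan] == -1). A standalone sketch of this sampling step, with hypothetical names and not part of this patch:

#include <cassert>
#include <vector>
// Pick a 1-based diagram id with probability numerators[d] / sum(numerators),
// consuming one uniform random number rnd in [0,1).
unsigned int sampleDiagram( const std::vector<double>& numerators, double rnd )
{
  double normalization = 0.;
  for( double n : numerators ) normalization += n; // same role as 'normalization' in the kernels
  assert( normalization > 0. );
  double cumsum = 0.;
  for( unsigned int d = 0; d < numerators.size(); ++d )
  {
    cumsum += numerators[d];
    if( rnd < cumsum / normalization ) return d + 1; // first bin whose CDF exceeds rnd
  }
  return (unsigned int)numerators.size(); // rounding guard, like 'channelId = mgOnGpu::nchannels' above
}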
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1194,7 +1275,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1242,82 +1324,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1342,13 +1439,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1356,13 +1447,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index bb3daa0e4d..36998a8fa1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input:
random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h new file mode 100644 index 0000000000..771b635b93 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
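The new per-process processConfig.h headers (ndiagrams = 3 for P0_gg_ttx above, ndiagrams = 1 for P0_uux_ttx here) are what size the enlarged numerator buffers throughout this patch: numerators now carry one value per (helicity, event, diagram), while denominators keep one value per (helicity, event). A sketch of the implied indexing, with hypothetical helper names:

// Hypothetical index helpers (not part of this patch) describing the layout used above.
// ghelAllNumerators holds nGoodHel * nevt * ndiagrams values:
inline int numeratorIndex( int ighel, int ievt, int idiag, int nevt, int ndiagrams )
{
  // matches 'ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams' plus [idiag]
  return ( ievt + ighel * nevt ) * ndiagrams + idiag;
}
// ghelAllDenominators keeps the old nGoodHel * nevt layout:
inline int denominatorIndex( int ighel, int ievt, int nevt )
{
  // matches 'ghelAllDenominators + ighel * nevt' plus [ievt]
  return ievt + ighel * nevt;
}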
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTX_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 1; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 7f29af7755..b8f69df605 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators for this helicity fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate multichannel numerators and denominators for this helicity fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -394,8 +455,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; jamp_sv[2] += amp_sv[0]; @@ -410,8 +474,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -424,8 +491,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -439,8 +509,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -452,8 +525,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -466,8 +542,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -480,8 +559,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -493,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -507,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -520,8 +608,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -534,8 +625,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -548,8 +642,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -564,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -577,8 +677,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -590,8 +693,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -949,9 +1055,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps,
gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1027,8 +1132,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1090,25 +1194,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1153,16 +1267,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color
selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1230,6 +1372,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1239,6 +1382,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt]
super-buffer for nGoodHel <= ncomb individual helicities @@ -1250,8 +1395,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1279,7 +1426,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1295,7 +1441,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1309,11 +1455,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1325,6 +1474,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1333,9 +1483,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, 
ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1346,9 +1497,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1386,40 +1543,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1434,7 +1557,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1482,82 +1606,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1582,13 +1721,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1596,13 +1729,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 1b49cac30b..1b956214b7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: 
random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h new file mode 100644 index 0000000000..47044dbe6a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
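This new per-process header (added here for P1_gg_ttxg with `ndiagrams = 16`, and further below for P1_gu_ttxu with `ndiagrams = 5`) is what lets the numerator buffers be sized and indexed per diagram. The offsets in the patch imply two distinct layouts: on the GPU each helicity block holds `nevt * ndiagrams` contiguous values with the diagram index running fastest, while the SIMD C++ path stores, within each page of `neppV` events, one `neppV`-wide slot per diagram. A minimal sketch of the two indexing conventions, with illustrative helper names that do not appear in the patch:

```cpp
// Index helpers mirroring the numerator-buffer layouts implied by the patch
// (helper names are illustrative, not part of the generated code).
#include <cassert>

// GPU layout: per-helicity blocks of nevt events, ndiagrams values per event,
// diagram index fastest: ghelAllNumerators[( ievt + ighel * nevt ) * ndiagrams + idiag]
inline int gpuNumIdx( int ighel, int ievt, int idiag, int nevt, int ndiagrams )
{
  return ( ievt + ighel * nevt ) * ndiagrams + idiag;
}

// SIMD C++ layout: pages of neppV events, one neppV-wide slot per diagram:
// allNumerators[ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV]
inline int cpuNumIdx( int ievt, int idiag, int neppV, int ndiagrams )
{
  return ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV;
}

int main()
{
  const int nevt = 16, ndiagrams = 16, neppV = 4;
  // distinct events never share a slot for the same diagram and helicity
  assert( gpuNumIdx( 0, 3, 5, nevt, ndiagrams ) != gpuNumIdx( 0, 4, 5, nevt, ndiagrams ) );
  // within one SIMD page, consecutive events of the same diagram are adjacent
  assert( cpuNumIdx( 1, 2, neppV, ndiagrams ) == cpuNumIdx( 0, 2, neppV, ndiagrams ) + 1 );
  return 0;
}
```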
+ + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXG_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 16; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXG_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index a15b72b642..7c3b3f4b4a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[2] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. * amp_sv[0]; @@ -426,8 +493,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -440,8 +510,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -454,8 +527,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -787,9 +863,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -865,8 +940,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
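As the per-diagram hunks above show (diagrams 1 to 5 of P1_gu_ttxu), the scalar `channelId` test in each generated amplitude block is replaced by a single `storeChannelWeights` flag: instead of accumulating one numerator for the preselected channel only, every diagram's `|amp|^2` now goes into its own slot so that a channel can be sampled afterwards, which is also why the helicity-filtering calls immediately above and below now pass `false` rather than `channelId = 0`. A condensed sketch of the old versus new accumulation rule; the generated code inlines this once per diagram, and `Amp` plus the container arguments are illustrative stand-ins:

```cpp
#include <complex>
#include <vector>

using Amp = std::complex<double>;

// Old rule: one numerator per event, filled only for the preselected channel.
void accumulateOld( unsigned int channelId, const std::vector<Amp>& amps,
                    double& numerator, double& denominator )
{
  for( unsigned int d = 0; d < amps.size(); ++d )
  {
    const double a2 = std::norm( amps[d] );   // |amp|^2, as cxabs2 computes
    if( channelId == d + 1 ) numerator += a2; // only the requested diagram
    if( channelId != 0 ) denominator += a2;   // all diagrams, when SDE is enabled
  }
}

// New rule: one numerator per diagram, so a channel can be sampled afterwards.
void accumulateNew( bool storeChannelWeights, const std::vector<Amp>& amps,
                    std::vector<double>& numerators, double& denominator )
{
  if( !storeChannelWeights ) return;
  for( unsigned int d = 0; d < amps.size(); ++d )
  {
    const double a2 = std::norm( amps[d] );
    numerators[d] += a2; // keep every diagram's weight
    denominator += a2;
  }
}
```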
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -928,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -991,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice 
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1068,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1077,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1088,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1117,7 +1234,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1133,7 +1249,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1147,11 +1263,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1163,6 +1282,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1171,9 +1291,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1184,9 +1305,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1224,40 +1351,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1272,7 +1365,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1320,82 +1414,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1420,13 +1529,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1434,13 +1537,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bd42537623..0bf2e4625f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h new file mode 100644 index 0000000000..fe66e4e760 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/processConfig.h @@ -0,0 +1,16 @@ +// 
Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXU_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 2cac6d6802..36ef0f1276 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -426,8 +493,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -440,8 +510,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -454,8 +527,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -787,9 +863,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -865,8 +940,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
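
The hunks above replace the old per-channel bookkeeping (`if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] )`) with an unconditional per-diagram store (`numerators_sv[N - 1] += cxabs2( amp_sv[0] )`) behind a single `storeChannelWeights` flag, with the numerator buffer enlarged to one slot per diagram per event. A minimal scalar sketch of this bookkeeping, ignoring the SIMD event pages and the GPU per-helicity super-buffers (the `ChannelWeights` type and its member names are illustrative, not part of the plugin):

```cpp
// Sketch only: scalar, single-event equivalent of the per-diagram
// numerator/denominator accumulation added in calculate_jamps above.
#include <cassert>
#include <complex>
#include <vector>

using fptype = double;
constexpr int ndiagrams = 5; // cf. processConfig::ndiagrams for these P1 subprocesses

struct ChannelWeights // illustrative name, not a plugin type
{
  std::vector<fptype> numerators;   // [nevt * ndiagrams]: one slot per diagram per event
  std::vector<fptype> denominators; // [nevt]: running sum over all diagrams
  explicit ChannelWeights( int nevt )
    : numerators( nevt * ndiagrams, 0 ), denominators( nevt, 0 ) {}

  // One call per diagram per good helicity, cf. 'numerators_sv[idiag] += cxabs2( amp_sv[0] )'
  void addAmp2( int ievt, int idiag, const std::complex<fptype>& amp )
  {
    const fptype amp2 = std::norm( amp ); // |amp|^2, cf. cxabs2
    numerators[ievt * ndiagrams + idiag] += amp2;
    denominators[ievt] += amp2;
  }

  // SDE weight for a 1-based channelId, cf. 'MEs_sv *= numerators_sv[channelId - 1] / denominators_sv'
  fptype weight( int ievt, unsigned int channelId ) const
  {
    assert( channelId >= 1 && channelId <= (unsigned int)ndiagrams );
    return numerators[ievt * ndiagrams + ( channelId - 1 )] / denominators[ievt];
  }
};
```

Keeping every diagram's |amp|^2 means the single-diagram-enhancement weight of any channel can still be formed after the helicity sum, which is what enables both the a-posteriori diagram sampling via `allrnddiagram` and the `mulChannelWeight` switch in `normalise_output`.
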
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -928,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -991,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice 
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1068,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1077,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1088,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1117,7 +1234,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1133,7 +1249,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1147,11 +1263,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1163,6 +1282,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1171,9 +1291,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1184,9 +1305,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, 
gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1224,40 +1351,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1272,7 +1365,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1320,82 +1414,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1420,13 +1529,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1434,13 +1537,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index dd4aae8a06..9191598e88 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h new file mode 100644 index 0000000000..89823b9d1d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/processConfig.h @@ -0,0 +1,16 
@@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H +#define MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 5; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 75c172df70..ab7500dca5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. 
* amp_sv[0]; @@ -412,8 +476,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[1] -= 1. / 2. * amp_sv[0]; @@ -426,8 +493,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -440,8 +510,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -454,8 +527,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VVV1_0( w_fp[4], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -787,9 +863,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -865,8 +940,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
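
As in the previous subprocess, `select_col_and_diag` (CUDA/HIP) and the C++ `sigmaKin` path draw an event-by-event channelId from the per-diagram numerators accumulated above, skipping diagrams whose `channel2iconfig` entry is -1 (see #917). A standalone sketch of that inverse-CDF selection for a single event, with plain scalar indexing instead of the `ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV` SIMD layout (the function name `sampleChannelId` is illustrative, not part of the plugin):

```cpp
// Sketch only: single-event, scalar version of the new event-by-event
// channel choice (inverse CDF over the per-diagram numerators).
#include <vector>

using fptype = double;

unsigned int sampleChannelId( const fptype* numerators,                // [ndiagrams] for one event
                              const std::vector<int>& channel2iconfig, // -1 marks "no SDE iconfig" (#917)
                              fptype rnd )                             // uniform random number in [0,1)
{
  const unsigned int nchannels = (unsigned int)channel2iconfig.size();
  fptype normalization = 0;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  fptype cumulative = 0;
  unsigned int channelId = nchannels; // fallback, cf. 'channelId = mgOnGpu::nchannels'
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue; // skip diagrams excluded from sampling
    cumulative += numerators[ichan];
    if( rnd < cumulative / normalization )
    {
      channelId = ichan + 1; // 1-based channelId, as written to allDiagramIdsOut
      break;
    }
  }
  return channelId;
}
```

The `nchannels` fallback mirrors `channelId = mgOnGpu::nchannels` in the kernels, guarding against floating-point rounding when the random number is very close to 1.
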
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -928,25 +1002,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -991,16 +1075,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice 
(nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1068,6 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1077,6 +1190,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1088,8 +1203,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1117,7 +1234,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = 
@@ -1068,6 +1180,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -1077,6 +1190,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1088,8 +1203,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol,          // output: color selection[nevt]
-    fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1117,7 +1234,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1133,7 +1249,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1147,11 +1263,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1163,6 +1282,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -1171,9 +1291,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
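For reference, the enlarged super-buffer layouts implied by the gpuMemset sizes and pointer arithmetic above can be captured in two small index helpers (illustrative only; these helpers do not exist in the codebase):

    // ghelAllNumerators  : nGoodHel * nevt * ndiagrams entries
    // ghelAllDenominators: nGoodHel * nevt entries
    inline int numIndex( int ighel, int ievt, int idiag, int nevt, int ndiag )
    {
      return ( ievt + ighel * nevt ) * ndiag + idiag; // diagrams contiguous per event
    }
    inline int denIndex( int ighel, int ievt, int nevt )
    {
      return ievt + ighel * nevt; // one entry per (helicity, event)
    }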
@@ -1184,9 +1305,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
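Part 1a thus follows a simple fan-out/join pattern: one calculate_jamps launch per good helicity, each on its own stream, with the cross-helicity kernels (add_and_select_hel, normalise_output, select_col_and_diag) acting as the join. A self-contained CUDA sketch of the pattern, with a hypothetical kernel standing in for calculate_jamps (illustrative; the real code goes through the gpuLaunchKernelStream abstraction):

    #include <cuda_runtime.h>

    // hypothetical per-helicity kernel: fills this helicity's slice of a buffer
    __global__ void perHelicityKernel( float* slice )
    {
      slice[blockIdx.x * blockDim.x + threadIdx.x] = 1.f;
    }

    void fanOutJoin( float* devBuf, cudaStream_t* streams, int nGoodHel, int blocks, int threads )
    {
      const int nevt = blocks * threads;
      for( int ighel = 0; ighel < nGoodHel; ighel++ ) // fan out: one stream per good helicity
        perHelicityKernel<<<blocks, threads, 0, streams[ighel]>>>( devBuf + ighel * nevt );
      for( int ighel = 0; ighel < nGoodHel; ighel++ ) // join before any cross-helicity reduction
        cudaStreamSynchronize( streams[ighel] );
    }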
@@ -1224,40 +1351,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1272,7 +1365,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -1320,82 +1414,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
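On the C++ side the same inverse-CDF draw runs per SIMD lane; the only subtlety is the index arithmetic, since numerators are stored page-major, then diagram-major, then lane-major. A hypothetical helper (not in the patch) spelling out the expression used above:

    inline int simdNumIndex( int ievt, int idiag, int neppV, int ndiag )
    {
      const int ipag = ievt / neppV;  // SIMD page containing the event
      const int ilane = ievt % neppV; // lane of the event within its page
      return ipag * neppV * ndiag + idiag * neppV + ilane;
    }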
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
           //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1420,13 +1529,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -1434,13 +1537,14 @@ namespace mg5amcCpu
      fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
      MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h
index 46d25105cc..514325b407 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -174,6 +175,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -197,6 +201,8 @@ namespace mg5amcCpu
     int* allselcol,          // output: color selection[nevt]
     fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
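The per-process constant that makes the fixed-size per-diagram buffers possible is generated into the tiny new header shown next. A sketch of how dependent code can consume it (illustrative usage, not taken from the patch):

    #include "processConfig.h"
    static_assert( processConfig::ndiagrams > 0, "expect at least one diagram" );
    // fixed-size per-diagram scratch: one slot per Feynman diagram of the process
    double numerators[processConfig::ndiagrams] = {};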
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h
new file mode 100644
index 0000000000..8754e13596
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXG_H
+#define MG5_CONFIG_SIGMA_SM_UUX_TTXG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 5;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
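Most of the CPPProcess.cc diff that follows is one mechanical rewrite repeated for every diagram: the runtime channelId test becomes an unconditional per-diagram accumulation, so the weights of all channels survive for a posteriori diagram sampling. Schematically, for each diagram number N (N is a placeholder here):

    // before: only the preselected channel kept its numerator
    //   if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] );
    //   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
    // after: every diagram keeps its own numerator, at index N-1
    //   if( storeChannelWeights )
    //   {
    //     numerators_sv[N - 1] += cxabs2( amp_sv[0] );
    //     denominators_sv += cxabs2( amp_sv[0] );
    //   }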
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc
index 382d6f340c..c5593feede 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // C++ (not CUDA)
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
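Call sites for the new getChannelId helper differ between the two builds: the GPU overload takes only the channel-id buffer (the event is implicit in the thread index via the device accessor), while the C++ overload receives the SIMD page start and may skip the mixed-precision cross-check. A sketch of both call shapes, mirroring the uses elsewhere in this patch:

    #ifdef MGONGPUCPP_GPUIMPL
      const unsigned int channelId = getChannelId( allChannelIds ); // event index implicit in the thread id
    #else
      const unsigned int channelId = getChannelId( allChannelIds, ievt00, false ); // SIMD page start; skip the #924 cross-check
    #endif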
+
   constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
@@ -239,7 +303,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
     fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+    bool storeChannelWeights, // input: compute the per-diagram multichannel numerators and denominators?
     fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype* colAllJamp2s,    // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -248,7 +312,7 @@ namespace mg5amcCpu
 #else
     cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+    bool storeChannelWeights, // input: compute the per-diagram multichannel numerators and denominators?
     fptype* allNumerators,   // input/output: multichannel numerators[nevt], add helicity ihel
     fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel
     fptype_sv* jamp2_sv,     // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -344,7 +408,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -357,7 +422,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
      COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -366,12 +431,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -430,8 +491,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 2
     VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+    if( storeChannelWeights )
+    {
+      numerators_sv[1] += cxabs2( amp_sv[0] );
+      denominators_sv += cxabs2( amp_sv[0] );
+    }
 #endif
     jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
     jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -450,8 +514,11 @@ namespace mg5amcCpu
     // Amplitude(s) for diagram number 3
     VVV1_0( w_fp[7], w_fp[4], w_fp[9],
COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -470,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -491,8 +561,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -505,8 +578,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += amp_sv[0]; jamp_sv[14] -= amp_sv[0]; @@ -521,8 +597,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -535,8 +614,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -549,8 +631,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += amp_sv[0]; jamp_sv[20] -= amp_sv[0]; @@ -565,8 +650,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, 
&_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -579,8 +667,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -593,8 +684,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[7] -= amp_sv[0]; @@ -609,8 +703,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -623,8 +720,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[6] -= amp_sv[0]; @@ -639,8 +739,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= amp_sv[0]; @@ -655,8 +758,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[16] += amp_sv[0]; jamp_sv[17] -= amp_sv[0]; @@ -673,8 +779,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) 
numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= amp_sv[0]; @@ -686,8 +795,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= amp_sv[0]; @@ -699,8 +811,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -714,8 +829,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -730,8 +848,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -744,8 +865,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -758,8 +882,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; jamp_sv[2] -= amp_sv[0]; @@ -774,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( 
storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -788,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -802,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -815,8 +951,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= amp_sv[0]; @@ -828,8 +967,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -841,8 +983,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= amp_sv[0]; @@ -854,8 +999,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -868,8 +1016,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; jamp_sv[1] -= 
amp_sv[0]; @@ -910,8 +1061,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] -= amp_sv[0]; @@ -923,8 +1077,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 34 FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[33] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= amp_sv[0]; @@ -936,8 +1093,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -950,8 +1110,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -966,8 +1129,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 37 FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[36] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -980,8 +1146,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 38 FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[37] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -994,8 +1163,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 39 VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[38] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += amp_sv[0]; jamp_sv[15] -= amp_sv[0]; @@ -1010,8 +1182,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 40 FFV1_0( w_fp[20], 
w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[39] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1024,8 +1199,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 41 FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[40] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1038,8 +1216,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 42 FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[41] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] -= amp_sv[0]; @@ -1051,8 +1232,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 43 FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[42] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[15] -= amp_sv[0]; @@ -1064,8 +1248,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 44 FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[43] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[23] -= amp_sv[0]; @@ -1077,8 +1264,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 45 FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[44] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[21] -= amp_sv[0]; @@ -1090,8 +1280,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 46 FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[45] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1104,8 +1297,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 47 VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); - 
if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[46] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += amp_sv[0]; jamp_sv[11] -= amp_sv[0]; @@ -1143,8 +1339,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 49 FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[48] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1157,8 +1356,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 50 FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[49] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += amp_sv[0]; jamp_sv[8] -= amp_sv[0]; @@ -1173,8 +1375,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 51 FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[50] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1187,8 +1392,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 52 FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[51] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1201,8 +1409,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 53 FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[52] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += amp_sv[0]; jamp_sv[13] -= amp_sv[0]; @@ -1217,8 +1428,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 54 FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[53] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -1231,8 +1445,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 55 FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 
0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[54] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[3] -= amp_sv[0];
@@ -1247,8 +1464,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 56
       FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[55] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += amp_sv[0];
       jamp_sv[11] -= amp_sv[0];
@@ -1263,8 +1483,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 57
       VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[56] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1317,8 +1540,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 59
       VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[58] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1337,8 +1563,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 60
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[59] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1357,8 +1586,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 61
       FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[60] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] += amp_sv[0];
       jamp_sv[20] -= amp_sv[0];
@@ -1373,8 +1605,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 62
       FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[61] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1387,8 +1622,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 63
       FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[62] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += amp_sv[0];
       jamp_sv[6] -= amp_sv[0];
@@ -1403,8 +1641,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 64
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[63] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1418,8 +1659,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 65
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[64] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1432,8 +1676,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 66
       FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[65] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += amp_sv[0];
       jamp_sv[8] -= amp_sv[0];
@@ -1448,8 +1695,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 67
       FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[66] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1462,8 +1712,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 68
       FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[67] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1476,8 +1729,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 69
       FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[68] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -1492,8 +1748,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 70
       FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[69] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1506,8 +1765,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 71
       FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[70] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1522,8 +1784,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 72
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[71] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += amp_sv[0];
       jamp_sv[9] -= amp_sv[0];
@@ -1538,8 +1803,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 73
       VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[72] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1592,8 +1860,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 75
       VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[74] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1612,8 +1883,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 76
       VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[75] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
@@ -1632,8 +1906,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 77
       FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[76] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] += amp_sv[0];
       jamp_sv[14] -= amp_sv[0];
@@ -1648,8 +1925,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 78
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[77] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1662,8 +1942,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 79
       FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[78] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1678,8 +1961,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 80
       FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[79] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1692,8 +1978,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 81
       FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[80] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= amp_sv[0];
@@ -1705,8 +1994,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 82
       FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[81] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] -= amp_sv[0];
@@ -1718,8 +2010,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 83
       FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[82] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] -= amp_sv[0];
@@ -1731,8 +2026,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 84
       FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[83] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= amp_sv[0];
@@ -1744,8 +2042,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 85
       FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[84] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1758,8 +2059,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 86
       FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[85] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += amp_sv[0];
       jamp_sv[7] -= amp_sv[0];
@@ -1774,8 +2078,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 87
       FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[86] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] -= amp_sv[0];
@@ -1787,8 +2094,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 88
       FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[87] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[13] -= amp_sv[0];
@@ -1800,8 +2110,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 89
       FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[88] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[22] -= amp_sv[0];
@@ -1813,8 +2126,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 90
       FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[89] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[19] -= amp_sv[0];
@@ -1826,8 +2142,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 91
       FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[90] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1840,8 +2159,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 92
       FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[91] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += amp_sv[0];
       jamp_sv[5] -= amp_sv[0];
@@ -1890,8 +2212,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 94
       VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[93] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1910,8 +2235,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 95
       VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[94] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1930,8 +2258,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 96
       FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[95] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += amp_sv[0];
       jamp_sv[19] -= amp_sv[0];
@@ -1946,8 +2277,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 97
       FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[96] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -1960,8 +2294,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 98
       FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[97] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += amp_sv[0];
       jamp_sv[2] -= amp_sv[0];
@@ -1976,8 +2313,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 99
       FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[98] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2024,8 +2364,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 101
       VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[100] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2044,8 +2387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 102
       VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[101] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2064,8 +2410,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 103
       FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[102] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += amp_sv[0];
       jamp_sv[13] -= amp_sv[0];
@@ -2080,8 +2429,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 104
       FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[103] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2094,8 +2446,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 105
       FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[104] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += amp_sv[0];
       jamp_sv[4] -= amp_sv[0];
@@ -2110,8 +2465,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 106
       FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[105] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2158,8 +2516,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 108
       VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[107] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
@@ -2178,8 +2539,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 109
       VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[108] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
@@ -2198,8 +2562,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 110
       FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[109] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[12] -= amp_sv[0];
@@ -2211,8 +2578,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 111
       FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[110] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[14] -= amp_sv[0];
@@ -2224,8 +2594,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 112
       FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[111] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[18] -= amp_sv[0];
@@ -2237,8 +2610,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 113
       FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[112] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[20] -= amp_sv[0];
@@ -2877,9 +3253,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -2955,8 +3330,7 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-      constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
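
The mechanical change repeated in every diagram hunk above is the same: instead of accumulating a single per-event numerator only when the preselected channelId matches the diagram, the kernel now records |amp|^2 for every diagram in its own numerator slot, so the weight of any channel can be formed (or a channel sampled) afterwards. A minimal standalone sketch of the two accumulation schemes, NOT the generated kernel itself (scalar fptype, no SIMD; 'amp2' is a hypothetical stand-in for the per-diagram values of cxabs2( amp_sv[0] )):

    #include <vector>

    using fptype = double;

    // Old scheme: one running numerator per event, gated on a preselected channelId.
    fptype oldChannelWeight( const std::vector<fptype>& amp2, unsigned int channelId )
    {
      fptype numerator = 0, denominator = 0;
      for( unsigned int idiag = 0; idiag < amp2.size(); ++idiag )
      {
        if( channelId == idiag + 1 ) numerator += amp2[idiag]; // only the chosen diagram
        if( channelId != 0 ) denominator += amp2[idiag];       // all diagrams
      }
      return ( channelId != 0 ? numerator / denominator : 0 ); // 0 disables the enhancement
    }

    // New scheme: keep one numerator slot per diagram (numerators.size() == ndiagrams),
    // so any channel weight can be formed after the fact.
    void accumulateChannelWeights( const std::vector<fptype>& amp2, bool storeChannelWeights,
                                   std::vector<fptype>& numerators, fptype& denominator )
    {
      if( !storeChannelWeights ) return;
      for( unsigned int idiag = 0; idiag < amp2.size(); ++idiag )
      {
        numerators[idiag] += amp2[idiag];
        denominator += amp2[idiag];
      }
    }

The trade-off is memory: the numerator buffers grow by a factor ndiagrams, which is why the allocations and gpuMemset sizes change elsewhere in this patch.
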
@@ -3018,25 +3392,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,   // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights, // if true, compute final multichannel weights
+                    bool mulChannelWeight,    // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -3081,16 +3465,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -3158,6 +3570,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
@@ -3167,6 +3580,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -3178,8 +3593,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,          // output: helicity selection[nevt]
-            fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -3207,7 +3624,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -3223,7 +3639,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -3237,11 +3653,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
      MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -3253,6 +3672,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
@@ -3261,9 +3681,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -3274,9 +3695,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
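
The new select_col_and_diag kernel samples a diagram (channel) per event by inverse-transform sampling on the per-diagram numerators: diagrams whose channel2iconfig entry is -1 are skipped, the remaining |amp|^2 sums form a cumulative distribution, and the first cumulative fraction exceeding the random number wins. A standalone sketch of that logic, simplified to plain arrays and a single event (it assumes at least one selectable channel so that 'normalization' is nonzero; it is not the kernel's actual signature):

    // Returns a 1-based channelId sampled in proportion to the numerators.
    unsigned int sampleChannel( const double* numerators,   // per-diagram |amp|^2 sums
                                const int* channel2iconfig, // -1 = not a selectable channel
                                unsigned int nchannels,
                                double rnd )                // random number in [0,1)
    {
      double normalization = 0;
      for( unsigned int ichan = 0; ichan < nchannels; ++ichan )
        if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
      double cumulative = 0;
      for( unsigned int ichan = 0; ichan < nchannels; ++ichan )
      {
        if( channel2iconfig[ichan] == -1 ) continue;
        cumulative += numerators[ichan];
        if( rnd < cumulative / normalization ) return ichan + 1; // 1-based channelId
      }
      return nchannels; // fallback for rnd ~ 1 after rounding, as in the kernel above
    }
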
@@ -3314,40 +3741,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -3362,7 +3755,8 @@ namespace mg5amcCpu
        cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+       bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -3410,82 +3804,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
++#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -3510,13 +3919,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -3524,13 +3927,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h
index 0c3370cd1c..c0d59a27ea 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s,        // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     fptype* ghelAllMEs,   // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol,           // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol,          // output: helicity selection[nevt]
     fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
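
The new processConfig.h below introduces the per-process ndiagrams constant that all the resized numerator buffers rely on. The indexing those sizes imply can be summarized in a small sketch; the layouts here are inferred from the offset arithmetic used in this patch, not from a documented API:

    constexpr int ndiagrams = 123; // processConfig::ndiagrams for gg -> ttxgg

    // GPU super-buffer: one slice per good helicity, then per event, then per diagram,
    // matching 'ghelAllNumerators + ( ievt + ighel * nevt ) * ndiagrams' above.
    inline int gpuNumeratorIndex( int ighel, int ievt, int idiag, int nevt )
    {
      return ( ievt + ighel * nevt ) * ndiagrams + idiag;
    }

    // C++/SIMD host buffer: events grouped in pages of neppV, diagram-major within a page,
    // matching 'ievt / neppV * neppV * ndiagrams + ichan * neppV + ievt % neppV' above.
    inline int simdNumeratorIndex( int ievt, int idiag, int neppV )
    {
      return ( ievt / neppV ) * neppV * ndiagrams + idiag * neppV + ievt % neppV;
    }
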
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h
new file mode 100644
index 0000000000..4f4a3c3bc0
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXGG_H
+#define MG5_CONFIG_SIGMA_SM_GG_TTXGG_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 123;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GG_TTXGG_H
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc
index e5370edc0d..d133fb651d 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -101,6 +102,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
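
getChannelId above centralizes the SIMD-page uniformity checks that sigmaKin previously inlined (and that this patch deletes there). The invariant it protects can be stated without the plugin's accessor classes; a hypothetical host-side equivalent using plain array access:

    #include <cassert>

    // One channelId per SIMD page of neppV events; 0 disables the SDE enhancement.
    unsigned int pageChannelId( const unsigned int* allChannelIds, int ievt0, int neppV )
    {
      if( allChannelIds == nullptr ) return 0;
      const unsigned int channelId = allChannelIds[ievt0];
      for( int i = 1; i < neppV; ++i )
        assert( channelId == allChannelIds[ievt0 + i] ); // uniformity invariant (#898)
      assert( channelId > 0 );
      return channelId;
    }
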
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. 
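
The rewrite of diagram 1 just above sets the pattern repeated for every diagram in this file: instead of accumulating a single scalar numerator only when channelId matches the diagram number, each diagram now unconditionally adds its |amp|^2 into its own slot numerators_sv[idiag], so a single pass leaves the weights of all channels available for diagram sampling and reweighting; the denominator remains one running sum. A schematic model of the new bookkeeping, with plain doubles standing in for the fptype_sv SIMD types and std::norm() playing the role of cxabs2():

// Schematic model only, not the generated code.
#include <complex>

void addDiagramWeight( int idiag, std::complex<double> amp,
                       double* numerators, double& denominator )
{
  const double w = std::norm( amp ); // |amp|^2
  numerators[idiag] += w;            // one slot per diagram (idiag = diagram number - 1)
  denominator += w;                  // single running sum over all diagrams
}
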
* cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +557,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[5] += 1. / 2. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * amp_sv[0]; jamp_sv[5] += 1. / 2. 
* amp_sv[0]; @@ -510,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[4] -= 1. / 6. * amp_sv[0]; @@ -524,8 +609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[1] += 1. / 2. * amp_sv[0]; @@ -538,8 +626,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[1], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * amp_sv[0]; jamp_sv[8] -= 1. / 6. * amp_sv[0]; @@ -567,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * amp_sv[0]; jamp_sv[10] += 1. / 2. * amp_sv[0]; @@ -581,8 +678,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * amp_sv[0]; jamp_sv[6] += 1. / 2. * amp_sv[0]; @@ -595,8 +695,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 6. * amp_sv[0]; jamp_sv[10] += 1. / 2. 
* amp_sv[0]; @@ -609,8 +712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +730,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * amp_sv[0]; jamp_sv[7] -= 1. / 6. * amp_sv[0]; @@ -638,8 +747,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[7] -= 1. / 6. * amp_sv[0]; @@ -652,8 +764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 6. * amp_sv[0]; jamp_sv[6] += 1. / 2. * amp_sv[0]; @@ -666,8 +781,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 6. * amp_sv[0]; @@ -680,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= 1. / 6. 
* amp_sv[0]; jamp_sv[9] += 1. / 2. * amp_sv[0]; @@ -709,8 +833,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * amp_sv[0]; jamp_sv[7] -= 1. / 6. * amp_sv[0]; @@ -723,8 +850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. * amp_sv[0]; @@ -737,8 +867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. * amp_sv[0]; @@ -751,8 +884,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 6. * amp_sv[0]; jamp_sv[9] += 1. / 2. * amp_sv[0]; @@ -779,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * amp_sv[0]; jamp_sv[2] += 1. / 2. * amp_sv[0]; @@ -807,8 +952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +969,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] += 1. / 2. * amp_sv[0]; jamp_sv[11] -= 1. / 6. * amp_sv[0]; @@ -835,8 +986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 6. * amp_sv[0]; @@ -863,8 +1020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -899,8 +1059,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[1], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -1284,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1362,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */
@@ -1425,25 +1589,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
                     const fptype globaldenom ) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
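
With one numerator slot per diagram, the helicity reduction in normalise_output above changes shape accordingly: the per-helicity numerator buffers are folded diagram-by-diagram into the "helicity #0" super-buffer, the scalar denominators are summed as before, and only if mulChannelWeight is set is the ME rescaled by the selected channel's share. A simplified single-event sketch of that reduction, using flat hypothetical arrays in place of the ghelAllNumerators/ghelAllDenominators super-buffers:

// Single-event sketch (assumed flat layouts: numerators[ighel * ndiagrams + idiag],
// denominators[ighel]); channelId == 0 disables the reweighting.
void reduceAndReweight( double* numerators, double* denominators,
                        int nGoodHel, int ndiagrams,
                        unsigned int channelId, double& me )
{
  for( int ighel = 1; ighel < nGoodHel; ++ighel ) // fold into helicity #0
  {
    for( int idiag = 0; idiag < ndiagrams; ++idiag )
      numerators[idiag] += numerators[ighel * ndiagrams + idiag];
    denominators[0] += denominators[ighel];
  }
  if( channelId > 0 ) // apply the single-diagram-enhancement weight
    me *= numerators[channelId - 1] / denominators[0];
}
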
@@ -1488,16 +1662,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1565,6 +1767,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  const fptype* allrndcol,           // input: random numbers[nevt] for color selection
                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+                 const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
                  fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                  int* allselhel,                    // output: helicity selection[nevt]
@@ -1574,6 +1777,8 @@
                  fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
                  fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1585,8 +1790,10 @@
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  int* allselcol,                    // output: color selection[nevt]
-                 fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
-                 fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+                 fptype* allNumerators,             // output: multichannel numerators[nevt], running_sum_over_helicities
+                 fptype* allDenominators,           // output: multichannel denominators[nevt], running_sum_over_helicities
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1614,7 +1821,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds; //
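
The diagram choice added to select_col_and_diag above is a standard inverse-CDF draw: normalise this event's per-diagram numerators, skipping diagrams without an SDE configuration (channel2iconfig[ichan] == -1), then walk the cumulative sum until it exceeds the random number, falling back to nchannels if rounding prevents the threshold from ever being crossed. A scalar sketch of the same draw, with simplified array names in place of the kernel's buffers:

// Scalar sketch of the inverse-CDF diagram draw for one event; 'numerators'
// holds this event's per-diagram weights, 'rnd' is a uniform number in [0,1).
unsigned int sampleDiagram( const double* numerators, const int* channel2iconfig,
                            unsigned int nchannels, double rnd )
{
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ++ichan )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  double cumsum = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ++ichan )
  {
    if( channel2iconfig[ichan] == -1 ) continue; // no SDE config for this diagram
    cumsum += numerators[ichan];
    if( rnd < cumsum / normalization ) return ichan + 1; // 1-based channelId
  }
  return nchannels; // fallback, matching the kernel above
}
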
non-trivial access: buffer includes all events #endif #endif @@ -1630,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1644,11 +1850,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1660,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1668,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1681,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1721,40 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1769,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1817,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+            jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
         for( int icolC = 0; icolC < ncolor; icolC++ )
         {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
           {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
             break;
           }
         }
-#endif
       }
     }
     else
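
In the C++ SIMD branch above, the per-diagram numerators are laid out page-major (AOSOA): the ndiagrams slots of one neppV event page are stored together, with the neppV lanes of each diagram contiguous. The index expression ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV used in the sampling loop therefore decomposes into a page base, a diagram offset and a lane offset (ieppV % neppV equals ievt % neppV here, since ievt00 is a multiple of neppV). A tiny sketch making that decomposition explicit, under the assumption that the layout is exactly [ipage][idiag][ilane]:

// Index of diagram 'ichan' for event 'ievt' in the assumed AOSOA layout
// [ipage][idiag][ilane], with ipage = ievt / neppV and ilane = ievt % neppV.
inline int numeratorIndex( int ievt, int ichan, int neppV, int ndiagrams )
{
  const int ipage = ievt / neppV; // which SIMD event page
  const int ilane = ievt % neppV; // lane within the page
  return ( ipage * ndiagrams + ichan ) * neppV + ilane; // == ievt/neppV*neppV*ndiagrams + ichan*neppV + ilane
}
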
@@ -1917,13 +2116,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -1931,13 +2124,14 @@
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h
index 471c526c49..abcc2d6233 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h
@@ -166,6 +166,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  const fptype* allrndcol,           // input: random numbers[nevt] for color selection
                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+                 const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
                  fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                  int* allselhel,                    // output: helicity selection[nevt]
@@ -174,6 +175,8 @@
                  fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
                  fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                  fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -190,6 +193,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
                  const fptype* allrndcol,           // input: random numbers[nevt] for color selection
                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+                 const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
                  fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
                  int* allselhel,                    // output: helicity selection[nevt]
@@ -197,6 +201,8 @@
                  int* allselcol,                    // output: color selection[nevt]
                  fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
                  fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+                 unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+                 bool mulChannelWeight,             // if true, multiply the ME output by the channel weight
 #endif
                  const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h
new file mode 100644
index 0000000000..e51eb2c6c2
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/processConfig.h
@@ -0,0 +1,16
@@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GG_TTXUUX_H +#define MG5_CONFIG_SIGMA_SM_GG_TTXUUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GG_TTXUUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 1fdfcee1ed..e2ac5942a4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[1], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[5] += 1. / 2. * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +557,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[1], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[1] += 1. / 6. 
* amp_sv[0]; @@ -510,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[5], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -524,8 +609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 6. * amp_sv[0]; @@ -538,8 +626,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -567,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[1], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[8] -= 1. / 2. * amp_sv[0]; @@ -581,8 +678,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[5], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[7] -= 1. / 2. * amp_sv[0]; @@ -595,8 +695,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. 
* amp_sv[0]; @@ -609,8 +712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +730,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * amp_sv[0]; jamp_sv[7] -= 1. / 2. * amp_sv[0]; @@ -638,8 +747,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -652,8 +764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 6. * amp_sv[0]; jamp_sv[7] -= 1. / 2. * amp_sv[0]; @@ -666,8 +781,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[1], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 6. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -680,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. 
* amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -709,8 +833,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -723,8 +850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[9] += 1. / 6. * amp_sv[0]; @@ -737,8 +867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -751,8 +884,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -779,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 6. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -807,8 +952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +969,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[1], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] -= 1. / 2. * amp_sv[0]; jamp_sv[9] += 1. / 6. * amp_sv[0]; @@ -835,8 +986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[1], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[4] += 1. / 6. * amp_sv[0]; @@ -863,8 +1020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -899,8 +1059,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -1284,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1362,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
#endif /* clang-format on */ @@ -1425,25 +1589,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply the matrix element by the channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: denominators must also be summed over helicities for the division below + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight && allChannelIds != nullptr ) // guard against a null allChannelIds as in the C++ path (fix segfault #892) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return;
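Note (illustrative sketch, not part of the patch): after this change ghelAllNumerators holds processConfig::ndiagrams numerators per event and per good helicity, flattened as ( ievt + ighel * nevt ) * ndiagrams + idiag, and normalise_output reduces all good helicities into the ighel=0 slice with one GPU thread per event. A minimal standalone C++ equivalent of that reduction, with hypothetical sizes and dummy data:

#include <cassert>
#include <vector>
int main()
{
  const int nGoodHel = 2, nevt = 4, ndiagrams = 3;                          // hypothetical sizes
  std::vector<double> ghelAllNumerators( nGoodHel * nevt * ndiagrams, 1. ); // dummy per-diagram numerators
  for( int ievt = 0; ievt < nevt; ++ievt )                                  // one GPU thread per event in the real kernel
    for( int ighel = 1; ighel < nGoodHel; ighel++ )                         // NB: the loop starts at ighel=1
      for( int idiag = 0; idiag < ndiagrams; ++idiag )
        ghelAllNumerators[ievt * ndiagrams + idiag] += ghelAllNumerators[( ievt + ighel * nevt ) * ndiagrams + idiag];
  assert( ghelAllNumerators[0] == nGoodHel ); // the ighel=0 slice now holds the sum over all good helicities
  return 0;
}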
@@ -1488,16 +1662,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: multichannel numerators[nevt*ndiagrams], summed over helicities + const fptype* allDenominators, // input: multichannel denominators[nevt], summed over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels )
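Note (illustrative sketch, not part of the patch): the diagram choice in select_col_and_diag above is an inverse-CDF draw over the per-diagram numerators, skipping channels whose channel2iconfig entry is -1 and falling back to the last channel if rounding pushes the random number past the accumulated sum. The same sampling logic in a minimal standalone form (hypothetical weights, masking omitted):

#include <cassert>
#include <vector>
// Sample a 1-based channel from unnormalized weights w using one uniform random number r in [0,1).
int sampleChannel( const std::vector<double>& w, double r )
{
  double norm = 0.;
  for( double wi : w ) norm += wi;
  double cdf = 0.;
  for( size_t i = 0; i < w.size(); ++i )
  {
    cdf += w[i];
    if( r < cdf / norm ) return (int)i + 1; // 1-based, like channelId
  }
  return (int)w.size(); // fallback for r close to 1, mirroring the kernel's initialisation to nchannels
}
int main()
{
  const std::vector<double> w = { 1., 3., 6. }; // hypothetical per-diagram numerators
  assert( sampleChannel( w, 0.05 ) == 1 );      // 0.05 < 1/10
  assert( sampleChannel( w, 0.30 ) == 2 );      // 1/10 <= 0.30 < 4/10
  assert( sampleChannel( w, 0.95 ) == 3 );      // 4/10 <= 0.95 < 1
  return 0;
}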
@@ -1565,6 +1767,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1574,6 +1777,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1585,8 +1790,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1614,7 +1821,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1630,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1644,11 +1850,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1660,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1668,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1681,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) ***
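Note (illustrative sketch, not part of the patch): part 1a launches one calculate_jamps kernel per good helicity, each in its own stream, so independent helicities may overlap on the GPU; the cross-helicity steps (add_and_select_hel, normalise_output, select_col_and_diag) only run after the streams have completed. The same pattern in plain CUDA, with a hypothetical 'work' kernel in place of the gpu* abstraction macros:

#include <cuda_runtime.h>
__global__ void work( float* buf, int ihel ) // hypothetical stand-in for calculate_jamps
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x; // one event per GPU thread
  buf[ievt] = (float)ihel;
}
int main()
{
  const int nGoodHel = 4, gpublocks = 2, gputhreads = 32;
  const int nevt = gpublocks * gputhreads;
  float* d_buf = nullptr;
  cudaMalloc( &d_buf, nGoodHel * nevt * sizeof( float ) );
  cudaStream_t streams[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) // kernels in different streams may run concurrently
    work<<<gpublocks, gputhreads, 0, streams[ighel]>>>( d_buf + ighel * nevt, ighel );
  cudaDeviceSynchronize(); // wait for all helicity streams before any cross-helicity reduction
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  cudaFree( d_buf );
  return 0;
}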
@@ -1721,40 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1769,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1817,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1917,13 +2116,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1931,13 +2124,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ )
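Note (illustrative sketch, not part of the patch): the rescaling just above multiplies the ME by the single-diagram-enhancement weight numerators[channelId - 1] / denominator, with both factors taken after the sum over good helicities. A tiny numeric example with hypothetical values:

#include <cassert>
int main()
{
  const int ndiagrams = 3;                       // hypothetical process
  double numerators[ndiagrams] = { 2., 3., 5. }; // per-diagram |amp|^2, summed over helicities
  double denominator = 2. + 3. + 5.;             // sum over all diagrams (and helicities)
  double me = 7.;                                // |M|^2 after the division by helcolDenominators
  const unsigned int channelId = 2;              // 1-based channel id, as in allChannelIds
  me *= numerators[channelId - 1] / denominator; // apply the channel weight
  assert( me == 7. * ( 3. / 10. ) );
  return 0;
}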
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index 0afe32f972..cedb7fcb32 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h new file mode 100644 index 0000000000..f7b8795d98 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_GU_TTXGU_H +#define MG5_CONFIG_SIGMA_SM_GU_TTXGU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_GU_TTXGU_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file
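Note (illustrative sketch, not part of the patch): each subprocess now ships a processConfig.h with a constexpr per-process diagram count, so the per-diagram numerator buffers can be sized at compile time. A minimal sketch of the intended usage (hypothetical nevt; the header contents are restated inline to keep the example self-contained):

#include <vector>
namespace processConfig { constexpr int ndiagrams = 36; } // as in the generated header above
int main()
{
  const int nevt = 16; // hypothetical number of events
  std::vector<double> numerators( nevt * processConfig::ndiagrams, 0. ); // one slot per event and per diagram
  return ( numerators.size() == 16 * 36 ? 0 : 1 );
}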
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index bc0fc369d4..58def243d0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +557,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * amp_sv[0]; jamp_sv[6] -= 1. / 6. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. 
* amp_sv[0]; @@ -510,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -524,8 +609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] -= 1. / 6. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -538,8 +626,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[9] -= 1. / 6. * amp_sv[0]; @@ -567,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * amp_sv[0]; jamp_sv[11] += 1. / 2. * amp_sv[0]; @@ -581,8 +678,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 6. * amp_sv[0]; @@ -595,8 +695,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= 1. / 6. * amp_sv[0]; jamp_sv[11] += 1. / 2. 
* amp_sv[0]; @@ -609,8 +712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +730,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[1] -= 1. / 6. * amp_sv[0]; @@ -638,8 +747,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[4] += 1. / 2. * amp_sv[0]; @@ -652,8 +764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 6. * amp_sv[0]; @@ -666,8 +781,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 6. * amp_sv[0]; @@ -680,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. 
* amp_sv[0]; jamp_sv[8] += 1. / 2. * amp_sv[0]; @@ -709,8 +833,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 6. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -723,8 +850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * amp_sv[0]; jamp_sv[10] -= 1. / 6. * amp_sv[0]; @@ -737,8 +867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 2. * amp_sv[0]; jamp_sv[10] -= 1. / 6. * amp_sv[0]; @@ -751,8 +884,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 2. * amp_sv[0]; jamp_sv[9] -= 1. / 6. * amp_sv[0]; @@ -779,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[8] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * amp_sv[0]; jamp_sv[6] -= 1. / 6. * amp_sv[0]; @@ -807,8 +952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +969,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= 1. / 6. * amp_sv[0]; jamp_sv[11] += 1. / 2. * amp_sv[0]; @@ -835,8 +986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 6. * amp_sv[0]; jamp_sv[7] += 1. / 2. * amp_sv[0]; @@ -863,8 +1020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[7] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -899,8 +1059,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -1284,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1362,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? 
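The hunks above show the heart of this patch: calculate_jamps no longer receives a per-event channelId; it receives a storeChannelWeights flag and fills one numerator slot per diagram (numerators_sv[idiag]), while helicity filtering simply passes false. A minimal standalone sketch of the before/after accumulation, with double standing in for fptype and the scalar (non-SIMD) case assumed for illustration only:

// Sketch only (not the generated code): one amplitude-squared contribution amp2
// for diagram idiag is folded into the multichannel numerator/denominator sums.

// Old accumulation: one numerator per event, gated by the preselected channel.
inline void accumulateOld( double& numerator, double& denominator, unsigned int channelId, unsigned int idiag, double amp2 )
{
  if( channelId == idiag + 1 ) numerator += amp2; // channelId is 1-based, idiag is 0-based
  if( channelId != 0 ) denominator += amp2;       // channelId == 0 disabled multichannel
}

// New accumulation: ndiagrams numerators per event, no channelId needed yet.
inline void accumulateNew( double* numerators, double& denominator, bool storeChannelWeights, unsigned int idiag, double amp2 )
{
  if( storeChannelWeights )
  {
    numerators[idiag] += amp2; // one slot per diagram: numerators[0..ndiagrams-1]
    denominator += amp2;       // running sum over all diagrams, as before
  }
}

Deferring the channel decision is what later allows both the channel-weight multiplication and the event-by-event diagram sampling to be done after the helicity loop.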
#endif /* clang-format on */ @@ -1425,25 +1589,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
    allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
    {
      fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
      fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
      for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
      {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
        totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
      }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
    }
 #endif
    return;
@@ -1488,16 +1662,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim ==
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1565,6 +1767,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1574,6 +1777,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1585,8 +1790,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1614,7 +1821,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1630,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1644,11 +1850,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1660,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1668,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1681,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1721,40 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
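With per-diagram numerators, normalise_output (launched above) must fold the nGoodHel per-helicity numerator vectors into the helicity #0 slot before applying the single-diagram weight. A host-side sketch of that per-event reduction, using double for fptype and a plain function in place of the one-thread-per-event kernel (names are illustrative, not the kernel's):

// Sketch of the normalise_output reduction for one event ievt.
void normaliseEvent( double& ME, double* helNumerators, double* helDenominators,
                     int nGoodHel, int nevt, int ievt, int ndiagrams,
                     unsigned int channelId, bool mulChannelWeight, double globaldenom )
{
  ME /= globaldenom; // average over helicities/colors
  double* totNum = helNumerators + ievt * ndiagrams; // reuse the "helicity #0" slot as the total
  double& totDen = helDenominators[ievt];
  for( int ighel = 1; ighel < nGoodHel; ighel++ ) // fold helicities 1..nGoodHel-1 into slot 0
  {
    const double* hNum = helNumerators + ( ievt + ighel * nevt ) * ndiagrams;
    for( int idiag = 0; idiag < ndiagrams; idiag++ ) totNum[idiag] += hNum[idiag];
    totDen += helDenominators[ievt + ighel * nevt];
  }
  if( mulChannelWeight ) ME *= totNum[channelId - 1] / totDen; // channelId is 1-based
}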
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1769,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1817,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1917,13 +2116,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1931,13 +2124,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index 949f0f0e2a..a7b234154a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h new file mode 100644 index 0000000000..94a05c400f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/processConfig.h @@ -0,0 
+1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_GUX_TTXGUX_H
+#define MG5_CONFIG_SIGMA_SM_GUX_TTXGUX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 36;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_GUX_TTXGUX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc
index c691d758e7..cc26513453 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -103,6 +104,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId!
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -244,7 +308,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -253,7 +317,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -349,7 +413,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -362,7 +427,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,12 +436,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -402,8 +463,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
* amp_sv[0]; @@ -418,8 +482,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -434,8 +501,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[1], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 4. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 4. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
@@ -865,9 +947,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -943,8 +1024,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1006,25 +1086,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1069,16 +1159,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1146,6 +1264,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -1155,6 +1274,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1166,8 +1287,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,                 // output: helicity selection[nevt]
-            fptype* allNumerators,          // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,        // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,          // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,        // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1195,7 +1318,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1211,7 +1333,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1225,11 +1347,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1241,6 +1366,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
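The select_col_and_diag kernel above samples a diagram with an inverse-CDF scan over the per-diagram numerators, skipping channels that have no SDE config (channel2iconfig == -1). A standalone sketch of that scan, with made-up numerators and mapping (illustrative assumptions only):

#include <cstdio>

int main()
{
  constexpr unsigned int nchannels = 7;
  const int channel2iconfig[nchannels] = { 1, 2, -1, 3, 4, -1, 5 };           // dummy mapping
  const double numerators[nchannels] = { 0.5, 0.1, 9., 0.2, 0.1, 9., 0.1 };   // dummy per-diagram sums
  const double rnd = 0.75;                                                    // one uniform number per event
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
    if( channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];    // skipped channels add nothing
  unsigned int channelId = nchannels; // fallback, only reached if rounding passes the last bin
  double numerator_sum = 0.;
  for( unsigned int ichan = 0; ichan < nchannels; ichan++ )
  {
    if( channel2iconfig[ichan] == -1 ) continue;
    numerator_sum += numerators[ichan];
    if( rnd < numerator_sum / normalization )
    {
      channelId = ichan + 1; // channel ids are 1-based
      break;
    }
  }
  printf( "sampled channelId=%u\n", channelId ); // here: cumulative {0.5,0.6,0.8,0.9,1.0} -> channel 4
  return 0;
}

Because the skipped channels are excluded from both the normalization and the cumulative sum, they can never be selected, and the valid channels are sampled in proportion to their single-diagram numerators.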
@@ -1249,9 +1375,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1262,9 +1389,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1302,40 +1435,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1350,7 +1449,8 @@ namespace mg5amcCpu
         cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
         // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+        bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
         calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
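The normalise_output kernel launched above now reduces the per-helicity numerator super-buffer diagram by diagram into the ighel = 0 slice, while the denominators keep one scalar per event. A standalone host-side model of that reduction (nevt, nGoodHel and the all-ones fill are assumptions for illustration):

#include <cassert>
#include <vector>

int main()
{
  constexpr int nevt = 2, nGoodHel = 3, ndiagrams = 7;
  std::vector<double> num( nGoodHel * nevt * ndiagrams, 1. ); // per-helicity numerators [ighel][ievt][idiag]
  std::vector<double> den( nGoodHel * nevt, 1. );             // per-helicity denominators [ighel][ievt]
  for( int ievt = 0; ievt < nevt; ++ievt )
    for( int ighel = 1; ighel < nGoodHel; ighel++ ) // NB: the loop starts at ighel=1
    {
      den[ievt] += den[ievt + ighel * nevt]; // scalar denominator total, as before
      double* hNum = &num[( ievt + ighel * nevt ) * ndiagrams];
      double* firstNum = &num[ievt * ndiagrams];
      for( int idiag = 0; idiag < ndiagrams; ++idiag )
        firstNum[idiag] += hNum[idiag]; // one running total per diagram: the new part
    }
  assert( num[0] == nGoodHel && den[0] == nGoodHel ); // each helicity slice contributed exactly once
  return 0;
}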
@@ -1398,82 +1498,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
         {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
           {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
         }
       }
       else
@@ -1498,13 +1613,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
    // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
      const int ievt0 = ipagV * neppV;
@@ -1512,13 +1621,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
       {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
         fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
         fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
       }
 #endif
       //for( int ieppV = 0; ieppV < neppV; ieppV++ )
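The C++ sampling loop above addresses the per-diagram numerators with the AOSOA index ievt / neppV * neppV * ndiagrams + ichan * neppV + ievt % neppV. A standalone check that this layout gives every (event, diagram) pair a distinct slot (neppV, ndiagrams and the fill pattern are assumed for the test):

#include <cassert>
#include <vector>

int main()
{
  constexpr int neppV = 4, ndiagrams = 7, nevt = 8;
  std::vector<double> allNumerators( nevt * ndiagrams );
  // Events come in pages of neppV; within a page the layout is [idiag][ieppV]
  for( int ievt = 0; ievt < nevt; ++ievt )
    for( int idiag = 0; idiag < ndiagrams; ++idiag )
      allNumerators[ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV] = 100. * ievt + idiag;
  // Round-trip: every slot can be read back unclobbered
  for( int ievt = 0; ievt < nevt; ++ievt )
    for( int idiag = 0; idiag < ndiagrams; ++idiag )
      assert( allNumerators[ievt / neppV * neppV * ndiagrams + idiag * neppV + ievt % neppV] == 100. * ievt + idiag );
  return 0;
}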
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h
index 25aacba743..23cb81ba6a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h
@@ -168,6 +168,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -176,6 +177,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -192,6 +195,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for channel sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -199,6 +203,8 @@ namespace mg5amcCpu
             int* allselcol,                    // output: helicity selection[nevt]
             fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
             fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply channel weight to ME output
 #endif
             const int nevt );                  // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
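With the new signature, every numerator buffer scales with the per-process diagram count. A host-side sketch of the resulting sizes, with assumed nevt and nGoodHel values (the processConfig.h introduced below provides ndiagrams = 7 for this subprocess):

#include <cstdio>

namespace processConfig { constexpr int ndiagrams = 7; } // mirrors processConfig.h below

int main()
{
  const int nevt = 16384, nGoodHel = 12;
  const size_t denSize = (size_t)nGoodHel * nevt;                               // unchanged: one scalar per event/helicity
  const size_t numSizeOld = (size_t)nGoodHel * nevt;                            // before: one scalar per event/helicity
  const size_t numSizeNew = (size_t)nGoodHel * processConfig::ndiagrams * nevt; // now: one slot per diagram
  printf( "denominators: %zu, numerators: %zu -> %zu fptype slots\n", denSize, numSizeOld, numSizeNew );
  return 0;
}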
+
+
+#ifndef MG5_CONFIG_SIGMA_SM_UC_TTXUC_H
+#define MG5_CONFIG_SIGMA_SM_UC_TTXUC_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 7;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SM_UC_TTXUC_H
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
index ca438c57e9..9603b4f631 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -109,6 +110,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                                               ,
+                                               const int ievt00,
+                                               bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0]; // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
   constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
   constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
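The getChannelId helper above centralizes the SIMD-page uniformity checks that were previously inlined in sigmaKin. A minimal standalone model of the contract it asserts (neppV and the channelIds content are assumptions): all neppV events of one SIMD page must carry the same channelId, because the C++ kernel computes one scalar channel per page.

#include <cassert>

unsigned int pageChannelId( const unsigned int* allChannelIds, int ievt00, int neppV )
{
  unsigned int channelId = allChannelIds[ievt00]; // element[0] of the page
  for( int i = 1; i < neppV; ++i )
    assert( channelId == allChannelIds[ievt00 + i] ); // mirrors SANITY CHECK #898
  return channelId;
}

int main()
{
  constexpr int neppV = 4;
  const unsigned int allChannelIds[2 * neppV] = { 3, 3, 3, 3, 5, 5, 5, 5 }; // two uniform pages
  assert( pageChannelId( allChannelIds, 0, neppV ) == 3 );
  assert( pageChannelId( allChannelIds, neppV, neppV ) == 5 );
  return 0;
}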
@@ -250,7 +314,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
                   fptype* allJamps,                  // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                  const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                  bool storeChannelWeights,
                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
                   fptype* colAllJamp2s,              // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
@@ -259,7 +323,7 @@ namespace mg5amcCpu
 #else
                   cxtype_sv* allJamp_sv,             // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                  const unsigned int channelId,      // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
+                  bool storeChannelWeights,
                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
                   fptype_sv* jamp2_sv,               // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
@@ -355,7 +419,8 @@ namespace mg5amcCpu
     const fptype* COUPs[nxcoup];
     for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = allNumerators;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams];
     fptype* denominators = allDenominators;
 #endif
 #else
@@ -368,7 +433,7 @@ namespace mg5amcCpu
     for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
       COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+    fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
     fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
 #endif
@@ -377,12 +442,8 @@ namespace mg5amcCpu
     for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#ifdef MGONGPUCPP_GPUIMPL
-    // SCALAR channelId for the current event (CUDA)
-    unsigned int channelId = gpu_channelId( allChannelIds );
-#endif
     // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+    fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
     fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
 #endif
@@ -408,8 +469,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 1
       FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[0] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
       jamp_sv[3] += 1. / 12. * amp_sv[0];
@@ -424,8 +488,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 2
       FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[1] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
       jamp_sv[3] += 1. / 12. * amp_sv[0];
@@ -440,8 +507,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 3
       VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[2] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
@@ -454,8 +524,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 4
       FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[3] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
@@ -470,8 +543,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 5
       FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[4] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
@@ -486,8 +562,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 6
       FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[5] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[1] -= 1. / 4. * amp_sv[0];
@@ -502,8 +581,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 7
       FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[6] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 12. * amp_sv[0];
       jamp_sv[2] -= 1. / 4. * amp_sv[0];
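On the GPU side of calculate_jamps above, each thread now writes its diagram slots at &allNumerators[ievt * ndiagrams], and each helicity owns a slice of size nevt * ndiagrams inside the super-buffer. A standalone check of that layout (nevt, nGoodHel and ndiagrams are assumed values):

#include <cassert>
#include <vector>

int main()
{
  constexpr int nevt = 8, nGoodHel = 3, ndiagrams = 7;
  std::vector<double> ghelAllNumerators( nGoodHel * nevt * ndiagrams, 0. );
  for( int ighel = 0; ighel < nGoodHel; ++ighel )
  {
    double* hAllNumerators = ghelAllNumerators.data() + ighel * nevt * ndiagrams; // one helicity slice
    for( int ievt = 0; ievt < nevt; ++ievt )
    {
      double* numerators = &hAllNumerators[ievt * ndiagrams]; // per-event pointer, as in calculate_jamps
      for( int idiag = 0; idiag < ndiagrams; ++idiag ) numerators[idiag] += 1.;
    }
  }
  for( double x : ghelAllNumerators ) assert( x == 1. ); // every slot touched exactly once, no overlap
  return 0;
}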
@@ -871,9 +953,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -949,8 +1030,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1012,25 +1092,35 @@ namespace mg5amcCpu
                     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
                     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                    bool storeChannelWeights,          // if true, compute final multichannel weights
+                    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-                    const fptype globaldenom ) /* clang-format on */
+                    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
       fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
       fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
       for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
       {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
         fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
         totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
       }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
     }
 #endif
     return;
@@ -1075,16 +1165,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -1152,6 +1270,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+            const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram sampling
 #endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
@@ -1161,6 +1280,8 @@ namespace mg5amcCpu
             fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
             fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            unsigned int* allDiagramIdsOut,    // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,             // if true, multiply channel weight to ME output
 #endif
             fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
             fptype* ghelAllJamps,              // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
@@ -1172,8 +1293,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             int* allselcol,                 // output: helicity selection[nevt]
-            fptype* allNumerators,          // tmp: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,        // tmp: multichannel denominators[nevt], running_sum_over_helicities
+            fptype* allNumerators,          // output: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,        // output: multichannel denominators[nevt], running_sum_over_helicities
+            unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+            bool mulChannelWeight,          // if true, multiply channel weight to ME output
 #endif
             const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -1201,7 +1324,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -1217,7 +1339,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -1231,11 +1353,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -1247,6 +1372,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
     // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
     // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
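The color choice in select_col_and_diag above is unchanged in spirit: cumulative |jamp|^2 over the colors allowed by icolamp for the selected config, followed by an inverse-CDF lookup with the color random number. A standalone sketch with dummy icolamp/jamp2 values (illustrative assumptions only):

#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  const bool icolamp[ncolor] = { true, false, true, true }; // allowed colors for this iconfig (dummy)
  const double jamp2[ncolor] = { 2., 5., 1., 1. };          // squared partial amplitudes (dummy)
  const double rndcol = 0.6;                                // uniform random number in [0,1)
  double targetamp[ncolor];
  for( int icolC = 0; icolC < ncolor; icolC++ )
  {
    targetamp[icolC] = ( icolC == 0 ? 0. : targetamp[icolC - 1] ); // running cumulative sum
    if( icolamp[icolC] ) targetamp[icolC] += jamp2[icolC];         // disallowed colors add nothing
  }
  int selcol = 0;
  for( int icolC = 0; icolC < ncolor; icolC++ )
    if( rndcol < targetamp[icolC] / targetamp[ncolor - 1] )
    {
      selcol = icolC + 1; // NB Fortran [1,ncolor] convention
      break;
    }
  printf( "selected color %d\n", selcol ); // here: cumulative {2,2,3,4}/4 -> color 3
  return 0;
}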
@@ -1255,9 +1381,10 @@ namespace mg5amcCpu
       const int ihel = cGoodHel[ighel];
       fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
       fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
       gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -1268,9 +1395,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -1308,40 +1441,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
@@ -1356,7 +1455,8 @@ namespace mg5amcCpu
         cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
         // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+        bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+        calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
         calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
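When mulChannelWeight is set, normalise_output above (and the C++ tail below) folds the single-diagram weight N_k / D into the matrix element, where D is the sum of the per-diagram numerators after the helicity reduction. A standalone arithmetic check with assumed values:

#include <cassert>
#include <cstdio>

int main()
{
  constexpr int ndiagrams = 7;
  const double numerators[ndiagrams] = { 4., 2., 1., 1., 1., 0.5, 0.5 }; // summed over helicities (dummy)
  double denominator = 0.;
  for( int idiag = 0; idiag < ndiagrams; ++idiag ) denominator += numerators[idiag];
  const unsigned int channelId = 1; // 1-based, as in allChannelIds / allDiagramIdsOut
  double ME = 0.02;                 // |M|^2 averaged over helicities and colors (dummy)
  ME *= numerators[channelId - 1] / denominator; // w_1 = 4/10 -> ME = 0.008
  assert( ME > 0.0079 && ME < 0.0081 );
  printf( "weighted ME = %f\n", ME );
  return 0;
}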
@@ -1404,82 +1504,97 @@ namespace mg5amcCpu
         }
 #endif
       }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-      // Event-by-event random choice of color #402
-      if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-      {
-        if( channelId > mgOnGpu::nchannels )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-        }
-        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-        if( iconfig <= 0 )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-          assert( iconfig > 0 ); // SANITY CHECK #917
-        }
-        else if( iconfig > (int)mgOnGpu::nconfigSDE )
-        {
-          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-        }
-        fptype_sv targetamp[ncolor] = { 0 };
-        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-        for( int icolC = 0; icolC < ncolor; icolC++ )
-        {
-          if( icolC == 0 )
-            targetamp[icolC] = fptype_sv{ 0 };
-          else
-            targetamp[icolC] = targetamp[icolC - 1];
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-        }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype_sv targetamp2[ncolor] = { 0 };
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+      const int vecsize = 2 * neppV;
+#else
+      const int vecsize = neppV;
+#endif
+      unsigned int channelIdVec[vecsize];
+      if( allChannelIds != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
-          if( icolC == 0 )
-            targetamp2[icolC] = fptype_sv{ 0 };
-          else
-            targetamp2[icolC] = targetamp2[icolC - 1];
-          // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+          const int ievt = ievt00 + ieppV;
+          channelIdVec[ieppV] = allChannelIds[ievt];
         }
-#endif
-        for( int ieppV = 0; ieppV < neppV; ++ieppV )
+      }
+
+      // Event-by-event random choice of channel
+      if( allrnddiagram != nullptr )
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
        {
           const int ievt = ievt00 + ieppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-          for( int icolC = 0; icolC < ncolor; icolC++ )
+          fptype numerator_sum = 0., normalization = 0.;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
           {
-#if defined MGONGPU_CPPSIMD
-            // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-            volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-            const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-            if( okcol )
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+          }
+          channelIdVec[ieppV] = mgOnGpu::nchannels;
+          for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+          {
+            if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+            numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                           ichan * neppV + ieppV % neppV];
+            if( allrnddiagram[ievt] < numerator_sum / normalization )
             {
-              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+              channelIdVec[ieppV] = ichan + 1;
               break;
             }
           }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          const int ievt2 = ievt00 + ieppV + neppV;
-          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+          allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+        }
+      }
+
+      // Event-by-event random choice of color #402
+      if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+      {
+        for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+        {
+          unsigned int channelId = channelIdVec[ieppV];
+          if( channelId > mgOnGpu::nchannels )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+            assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+          }
+          // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+          // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+          const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+          if( iconfig <= 0 )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+            assert( iconfig > 0 ); // SANITY CHECK #917
+          }
+          else if( iconfig > (int)mgOnGpu::nconfigSDE )
+          {
+            printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+            assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+          }
+          fptype targetamp[ncolor] = { 0 };
+          // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+          for( int icolC = 0; icolC < ncolor; icolC++ )
+          {
+            if( icolC == 0 )
+              targetamp[icolC] = 0;
+            else
+              targetamp[icolC] = targetamp[icolC - 1];
+            if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+              jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+          }
+          const int ievt = ievt00 + ieppV;
+          //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
           for( int icolC = 0; icolC < ncolor; icolC++ )
          {
-            if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+            if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
             {
-              allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-              //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+              allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+              //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
               break;
             }
           }
-#endif
        }
       }
       else
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1504,13 +1619,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1518,13 +1627,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index ac3df5ae1d..05b5116162 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -174,6 +174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, 
// input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -182,6 +183,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -198,6 +201,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -205,6 +209,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h new file mode 100644 index 0000000000..f52e249e91 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
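For orientation: with processConfig::ndiagrams available, the numerator buffers in this patch grow from one entry per event to one entry per event and per diagram, so that a channel can be sampled a posteriori from the stored per-diagram weights. A minimal sketch of the index arithmetic implied by the accesses in this diff (the helper names numIdxGpu and numIdxCpp are illustrative, not part of the patch; neppV and processConfig::ndiagrams are assumed in scope as in the surrounding code):

// Sketch only: index helpers matching the two layouts used in this patch.
// GPU layout: each event owns a contiguous slice of ndiagrams entries,
// cf. '&allNumerators[ievt * processConfig::ndiagrams]' in calculate_jamps.
inline int numIdxGpu( int ievt, int idiag )
{
  return ievt * processConfig::ndiagrams + idiag;
}
// SIMD C++ layout: events are paged by neppV, with the diagram as the middle index,
// cf. 'allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + ichan * neppV + ieppV % neppV]' in sigmaKin.
inline int numIdxCpp( int ievt, int idiag )
{
  return ( ievt / neppV ) * neppV * processConfig::ndiagrams + idiag * neppV + ( ievt % neppV );
}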
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UCX_TTXUCX_H +#define MG5_CONFIG_SIGMA_SM_UCX_TTXUCX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UCX_TTXUCX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 7603295c95..a72a6225b4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
* amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -446,8 +516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -462,8 +535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[1], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 4. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -512,8 +594,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -526,8 +611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -542,8 +630,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[1], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -558,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -574,8 +668,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -590,8 +687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 4. * amp_sv[0]; @@ -606,8 +706,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
* amp_sv[0]; @@ -975,9 +1078,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1053,8 +1155,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1116,25 +1217,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + 
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1179,16 +1290,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1256,6 +1395,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1265,6 +1405,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1276,8 +1418,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1305,7 +1449,6 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1321,7 +1464,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1335,11 +1478,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1351,6 +1497,7 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1359,9 
+1506,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1372,9 +1520,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1412,40 +1566,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1460,7 +1580,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1508,82 +1629,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1608,13 +1744,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1622,13 +1752,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 8af0c5a78c..70a92da32a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // 
input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h new file mode 100644 index 0000000000..8f10a6d734 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
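For orientation: the event-by-event channel choice added in sigmaKin and select_col_and_diag above is an inverse-CDF draw over the per-diagram numerators, skipping diagrams that have no associated SDE config (channel2iconfig[ichan] == -1) and falling back to the last channel when rounding leaves the cumulative sum below the random number. A minimal standalone sketch under stated assumptions (the function name is illustrative, the numerators are assumed in a flat per-event layout, and double stands in for fptype):

// Sketch only: sample a 1-based channelId from per-diagram weights,
// mirroring the selection loops in sigmaKin / select_col_and_diag above.
unsigned int sampleDiagram( const double* numerators, double rnddiagram ) // rnddiagram flat in [0,1)
{
  // Normalization: sum of weights over diagrams that map to an SDE config
  double normalization = 0.;
  for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
    if( mgOnGpu::channel2iconfig[ichan] != -1 ) normalization += numerators[ichan];
  // Inverse-CDF draw: return the first channel whose cumulative weight exceeds rnddiagram
  double numerator_sum = 0.;
  for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
  {
    if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
    numerator_sum += numerators[ichan];
    if( rnddiagram < numerator_sum / normalization ) return ichan + 1; // 1-based channelId
  }
  return mgOnGpu::nchannels; // fallback for rounding effects, as in the patch
}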
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UU_TTXUU_H +#define MG5_CONFIG_SIGMA_SM_UU_TTXUU_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 14; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UU_TTXUU_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index 77d7eddc6c..83505e62ba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -109,6 +110,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -250,7 +314,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -259,7 +323,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -355,7 +419,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -368,7 +433,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -377,12 +442,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -408,8 +469,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. 
* amp_sv[0]; @@ -424,8 +488,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -440,8 +507,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -454,8 +524,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -470,8 +543,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -486,8 +562,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -502,8 +581,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. 
* amp_sv[0]; @@ -871,9 +953,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -949,8 +1030,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1012,25 +1092,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + 
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1075,16 +1165,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1152,6 +1270,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1161,6 +1280,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1172,8 +1293,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1201,7 +1324,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1217,7 +1339,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1231,11 +1353,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1247,6 +1372,7 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1255,9 
+1381,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1268,9 +1395,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1308,40 +1441,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1356,7 +1455,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1404,82 +1504,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1504,13 +1619,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1518,13 +1627,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index f37d8b5515..b7ffff9d65 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -174,6 +174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, 
// input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -182,6 +183,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -198,6 +201,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -205,6 +209,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h new file mode 100644 index 0000000000..2dfae1920f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
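Aside: this header's single constant, processConfig::ndiagrams, is what sizes the new per-diagram numerator buffers above. A minimal sketch of the two index layouts implied by the CPPProcess.cc changes, assuming neppV is the SIMD page size used elsewhere in the codebase; the helper names gpuNumeratorIndex and cppNumeratorIndex are illustrative, not code added by this patch.

    // Sketch only, inferred from the indexing in this patch.
    // GPU layout: one event per thread, the ndiagrams weights of one event are contiguous,
    // cf. &allNumerators[ievt * processConfig::ndiagrams] in calculate_jamps.
    inline int gpuNumeratorIndex( int ievt, int idiag, int ndiagrams )
    {
      return ievt * ndiagrams + idiag;
    }
    // C++/SIMD layout: events grouped in pages of neppV, each diagram strided by neppV
    // within a page, cf. allNumerators[ievt / neppV * neppV * processConfig::ndiagrams
    // + ichan * neppV + ieppV % neppV] in the sigmaKin sampling loop.
    inline int cppNumeratorIndex( int ievt, int idiag, int ndiagrams, int neppV )
    {
      const int ievtPage = ievt / neppV * neppV; // first event of this SIMD page
      return ievtPage * ndiagrams + idiag * neppV + ievt % neppV;
    }
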
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXCCX_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXCCX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXCCX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 4f2c72bff8..4c8f471aba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[5] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -481,8 +557,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[0], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[10] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. 
* amp_sv[0]; @@ -510,8 +592,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -524,8 +609,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -538,8 +626,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 VVV1_0( w_fp[5], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -553,8 +644,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[9] -= 1. / 2. * amp_sv[0]; @@ -567,8 +661,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[0], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -581,8 +678,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[9] -= 1. / 2. * amp_sv[0]; jamp_sv[11] += 1. / 6. * amp_sv[0]; @@ -595,8 +695,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[5] -= 1. / 2. 
* amp_sv[0]; @@ -609,8 +712,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 15 VVV1_0( w_fp[5], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[14] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -624,8 +730,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 16 FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[15] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 6. * amp_sv[0]; jamp_sv[9] -= 1. / 2. * amp_sv[0]; @@ -638,8 +747,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 17 FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[16] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[8] += 1. / 6. * amp_sv[0]; @@ -652,8 +764,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 18 FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[17] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[6] += 1. / 6. * amp_sv[0]; jamp_sv[9] -= 1. / 2. * amp_sv[0]; @@ -666,8 +781,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 19 FFV1_0( w_fp[0], w_fp[11], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[18] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -680,8 +798,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 20 VVV1_0( w_fp[5], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[19] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -695,8 +816,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 21 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[20] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2.
* amp_sv[0]; jamp_sv[8] += 1. / 6. * amp_sv[0]; @@ -709,8 +833,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 22 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[21] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[8] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -723,8 +850,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 23 FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[22] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[7] += 1. / 6. * amp_sv[0]; jamp_sv[10] -= 1. / 2. * amp_sv[0]; @@ -737,8 +867,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 24 FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[23] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * amp_sv[0]; jamp_sv[7] += 1. / 6. * amp_sv[0]; @@ -751,8 +884,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 25 VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[24] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -765,8 +901,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 26 FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[25] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 6. * amp_sv[0]; jamp_sv[3] -= 1. / 2. * amp_sv[0]; @@ -779,8 +918,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 27 FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[26] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; @@ -793,8 +935,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 28 FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[27] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 6. * amp_sv[0]; jamp_sv[4] -= 1. / 2. * amp_sv[0]; @@ -807,8 +952,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 29 FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[28] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -821,8 +969,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 30 FFV1_0( w_fp[0], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[29] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] -= 1. / 2. * amp_sv[0]; jamp_sv[7] += 1. / 6. * amp_sv[0]; @@ -835,8 +986,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 31 FFV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[30] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -849,8 +1003,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 32 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[31] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[6] += 1. / 6. * amp_sv[0]; @@ -863,8 +1020,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 33 FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[32] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
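Across these hunks the pattern is uniform: each diagram's |amplitude|^2 now goes into its own numerator slot (diagram N fills numerators_sv[N-1]), while every diagram also feeds the shared denominator. A minimal scalar sketch of that accumulation, with assumed stand-in names (std::norm plays the role of cxabs2, NDIAGRAMS stands for processConfig::ndiagrams):

#include <complex>
constexpr int NDIAGRAMS = 36; // stand-in for processConfig::ndiagrams
// Sketch only: per-diagram numerators plus one shared denominator,
// mirroring the storeChannelWeights blocks in the hunks above.
inline void accumulateDiagram( int idiag, std::complex<double> amp, double* numerators, double& denominator )
{
  const double amp2 = std::norm( amp ); // |amp|^2, like cxabs2( amp_sv[0] )
  numerators[idiag] += amp2;            // diagram idiag+1 -> slot idiag
  denominator += amp2;                  // running sum over all diagrams
}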
@@ -899,8 +1059,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 35 VVV1_0( w_fp[5], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[34] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * amp_sv[0]; jamp_sv[5] -= 1. / 2. * amp_sv[0]; @@ -915,8 +1078,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 36 VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[35] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] -= 1. / 2. * amp_sv[0]; jamp_sv[4] -= 1. / 2. * amp_sv[0]; @@ -1284,9 +1450,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1362,8 +1527,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
#endif /* clang-format on */ @@ -1425,25 +1589,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: sum the denominators over helicities too + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return;
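Read sequentially, normalise_output first divides each ME by the global denominator, then folds the per-helicity numerator and denominator buffers into helicity slot #0 and applies the single-diagram-enhancement weight. A scalar host-side sketch of that reduction (hypothetical names; assumes the flat [ighel][ievt][idiag] numerator layout used above and a valid channelIds array):

// Sketch only, not the generated kernel.
static void normaliseSketch( double* allMEs, double* numerators, double* denominators,
                             const unsigned int* channelIds, int nevt, int nGoodHel,
                             int ndiagrams, double globaldenom )
{
  for( int ievt = 0; ievt < nevt; ++ievt )
  {
    allMEs[ievt] /= globaldenom;
    for( int ighel = 1; ighel < nGoodHel; ++ighel ) // slot ighel=0 accumulates the totals
    {
      for( int idiag = 0; idiag < ndiagrams; ++idiag )
        numerators[ievt * ndiagrams + idiag] += numerators[( ievt + ighel * nevt ) * ndiagrams + idiag];
      denominators[ievt] += denominators[ievt + ighel * nevt];
    }
    // SDE enhancement: weight the ME by this event's channel share of the total
    allMEs[ievt] *= numerators[ievt * ndiagrams + ( channelIds[ievt] - 1 )] / denominators[ievt];
  }
}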
@@ -1488,16 +1662,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[nevt*ndiagrams] + const fptype* allDenominators, // input: denominators[nevt] + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1565,6 +1767,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1574,6 +1777,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1585,8 +1790,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif
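The channel choice in select_col_and_diag is a standard inverse-CDF draw over the per-diagram weights. A minimal standalone sketch of the same selection (hypothetical host-side names; numerators holds this event's per-diagram weights, rnd is uniform in [0,1)):

// Sketch: pick a diagram with probability numerator[i] / sum(numerators),
// skipping diagrams with no SDE config (channel2iconfig[i] == -1).
static unsigned int sampleDiagram( const double* numerators, const int* channel2iconfig,
                                   unsigned int nchannels, double rnd )
{
  double norm = 0.;
  for( unsigned int i = 0; i < nchannels; ++i )
    if( channel2iconfig[i] != -1 ) norm += numerators[i];
  double cumsum = 0.;
  for( unsigned int i = 0; i < nchannels; ++i )
  {
    if( channel2iconfig[i] == -1 ) continue;
    cumsum += numerators[i];
    if( rnd < cumsum / norm ) return i + 1; // 1-based channelId, as in the kernel
  }
  return nchannels; // fallback, mirroring the kernel's last-channel default
}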
@@ -1614,7 +1821,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1630,7 +1836,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1644,11 +1850,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1660,6 +1869,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1668,9 +1878,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1681,9 +1892,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col,
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1721,40 +1938,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1769,7 +1952,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1817,82 +2001,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1917,13 +2116,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1931,13 +2124,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 311a96d812..8be5530c1c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h new file mode 100644 index 0000000000..25081a00da --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXGG_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXGG_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 36; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXGG_H \ No newline at end of file
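The new per-process processConfig.h makes the diagram count a compile-time constant, so buffer sizes and flat indices can be derived from it. A small illustrative helper (an assumption-level sketch, not part of the patch):

#include <cstddef>
#include "processConfig.h"
// One numerator slot per diagram and per event, replicated per good helicity:
// total size = nGoodHel * nevt * processConfig::ndiagrams (cf. the gpuMemset calls in this patch).
inline std::size_t numeratorBufferSize( int nGoodHel, int nevt )
{
  return static_cast<std::size_t>( nGoodHel ) * nevt * processConfig::ndiagrams;
}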
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 65f0e5aaf4..25cb87562c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId!
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, // input: if true, accumulate per-diagram numerators and denominators fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12.
* amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -446,8 +516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -462,8 +535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -512,8 +594,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] += 1. / 4.
* cxtype( 0, 1 ) * amp_sv[0]; @@ -526,8 +611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -542,8 +630,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -558,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -574,8 +668,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -590,8 +687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -606,8 +706,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 36. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0];
@@ -975,9 +1078,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1053,8 +1155,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1116,25 +1217,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + {
+ firstNumerator[idiag] += hAllNumerators[idiag]; + } + totAllDenominators[ievt] += hAllDenominators[ievt]; // NB: sum the denominators over helicities too + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1179,16 +1290,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: per-diagram numerators[nevt*ndiagrams] + const fptype* allDenominators, // input: denominators[nevt] + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels )
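For reference, the numerator super-buffers used in both copies of this kernel keep all diagrams of one event contiguous, with one block per good helicity. A one-line helper stating the assumed flat layout (illustrative only, not part of the patch):

// ghelAllNumerators layout: [ighel][ievt][idiag]
inline int numeratorIndex( int ighel, int ievt, int idiag, int nevt, int ndiagrams )
{
  return ( ievt + ighel * nevt ) * ndiagrams + idiag;
}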
@@ -1256,6 +1395,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1265,6 +1405,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1276,8 +1418,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: sampled diagram ids[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the output ME by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1305,7 +1449,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1321,7 +1464,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1335,11 +1478,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1351,6 +1497,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1359,9
+1506,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1372,9 +1520,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1412,40 +1566,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1460,7 +1580,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1508,82 +1629,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1608,13 +1744,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1622,13 +1752,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 75597d043e..13ce403cae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol,
// input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h new file mode 100644 index 0000000000..e6c319de76 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/processConfig.h @@ -0,0 +1,21 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
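+//
+// NB: processConfig::ndiagrams is the number of Feynman diagrams in this subprocess.
+// It sets the per-event stride of the multichannel numerator buffers, which store one
+// squared amplitude per diagram (see calculate_jamps, normalise_output and
+// select_col_and_diag) instead of a single numerator for the preselected channel.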
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UUX_TTXUUX_H +#define MG5_CONFIG_SIGMA_SM_UUX_TTXUUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 14; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UUX_TTXUUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index d938dc9999..5096994855 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -103,6 +104,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -244,7 +308,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -253,7 +317,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -349,7 +413,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -362,7 +427,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -371,12 +436,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -402,8 +463,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. 
* amp_sv[0]; @@ -418,8 +482,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -434,8 +501,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -448,8 +518,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. * amp_sv[0]; @@ -464,8 +537,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[3], w_fp[0], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. 
* amp_sv[0]; @@ -865,9 +947,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -943,8 +1024,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1006,25 +1086,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + 
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1069,16 +1159,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1146,6 +1264,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1155,6 +1274,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1166,8 +1287,10 @@ #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1195,7 +1318,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1211,7 +1333,7 @@ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1225,11 +1347,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1241,6 +1366,7 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1249,9
+1375,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1262,9 +1389,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1302,40 +1435,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1350,7 +1449,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1398,82 +1498,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1498,13 +1613,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1512,13 +1621,14 @@ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index ebbade848b..f4abe8c1e9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -168,6 +168,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype*
allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -176,6 +177,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -192,6 +195,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -199,6 +203,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h new file mode 100644 index 0000000000..7adcbeb7fc --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/processConfig.h @@ -0,0 +1,21 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
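+//
+// NB: processConfig::ndiagrams is the number of Feynman diagrams in this subprocess.
+// It sets the per-event stride of the multichannel numerator buffers, which store one
+// squared amplitude per diagram (see calculate_jamps, normalise_output and
+// select_col_and_diag) instead of a single numerator for the preselected channel.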
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UXCX_TTXUXCX_H +#define MG5_CONFIG_SIGMA_SM_UXCX_TTXUXCX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 7; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UXCX_TTXUXCX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index ef2de04fdb..44dceb663b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,6 +102,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -242,7 +306,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -251,7 +315,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -347,7 +411,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -360,7 +425,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -369,12 +434,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 12. 
* amp_sv[0]; @@ -416,8 +480,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -432,8 +499,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; @@ -446,8 +516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[9], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. * amp_sv[0]; @@ -462,8 +535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[2] -= 1. / 12. * amp_sv[0]; @@ -480,8 +556,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -496,8 +575,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -512,8 +594,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 8 VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[7] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; @@ -526,8 +611,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 9 FFV1_0( w_fp[10], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[8] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[1] -= 1. / 4. * amp_sv[0]; @@ -542,8 +630,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 10 FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[9] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 12. * amp_sv[0]; jamp_sv[2] -= 1. / 4. * amp_sv[0]; @@ -558,8 +649,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 11 FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[10] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -574,8 +668,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 12 FFV1_0( w_fp[10], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[11] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 4. * amp_sv[0]; jamp_sv[3] += 1. / 12. * amp_sv[0]; @@ -590,8 +687,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 13 FFV1_0( w_fp[6], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[12] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 4. * amp_sv[0]; jamp_sv[1] -= 1. / 12. * amp_sv[0]; @@ -606,8 +706,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 14 FFV1_0( w_fp[10], w_fp[0], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[13] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 12. * amp_sv[0]; jamp_sv[3] += 1. / 36. 
* amp_sv[0]; @@ -975,9 +1078,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1053,8 +1155,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1116,25 +1217,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { +
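// accumulate this helicity's per-diagram numerators onto the ighel=0 slice, so that after the loop firstNumerator holds the per-diagram totals over all good helicities +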
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1179,16 +1290,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1256,6 +1395,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1265,6 +1405,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1276,8 +1418,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1305,7 +1449,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1321,7 +1464,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1335,11 +1478,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1351,6 +1497,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1359,9 
+1506,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1372,9 +1520,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1412,40 +1566,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1460,7 +1580,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1508,82 +1629,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1608,13 +1744,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1622,13 +1752,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 17c9c0faf1..e4b749f215 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -166,6 +166,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* 
allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -174,6 +175,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -190,6 +193,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -197,6 +201,8 @@ namespace mg5amcCpu int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h new file mode 100644 index 0000000000..368fc584e6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin.
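+// NB: ndiagrams = 14 matches the 14 single-diagram |amp|^2 contributions accumulated +// into numerators_sv[0]..numerators_sv[13] in this subprocess's calculate_jamps above.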
+ + +#ifndef MG5_CONFIG_SIGMA_SM_UXUX_TTXUXUX_H +#define MG5_CONFIG_SIGMA_SM_UXUX_TTXUXUX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 14; + +} + +#endif // MG5_CONFIG_SIGMA_SM_UXUX_TTXUXUX_H \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + 
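// the returned dummy value only exists to drive the thread-safe one-time call of this helicity filtering (static local initialization in initialize() below) +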
return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
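+ // bind each recognized input key to the matching typed pointer declared above; keys this process cannot honour are rejected with UMAMI_ERROR_UNSUPPORTED_INPUT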
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round to double page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
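The entry points above implement the C interface declared in umami.h, added below. As a rough sketch of how a caller might drive them, assuming a contiguous batch (stride == count, offset == 0), a hypothetical parameter card path, and an illustrative four-particle layout of npar x 4 doubles per event:

#include "umami.h"
#include <cstdio>
#include <vector>

int main()
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // hypothetical card path

  const size_t count = 16, stride = 16, offset = 0;                  // contiguous batch
  std::vector<double> momenta( count * 4 /*npar, illustrative*/ * 4 ); // fill with phase-space points
  std::vector<double> m2( count );

  UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA };
  const void* inputs[] = { momenta.data() };
  UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outputs[] = { m2.data() };

  UmamiStatus s = umami_matrix_element( handle, count, stride, offset,
                                        1, in_keys, inputs, 1, out_keys, outputs );
  if( s == UMAMI_SUCCESS ) std::printf( "|M|^2[0] = %g\n", m2[0] );
  umami_free( handle );
  return s == UMAMI_SUCCESS ? 0 : 1;
}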
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
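The metadata keys let a caller size its output buffers before the first evaluation; for instance, UMAMI_OUT_DIAGRAM_AMP2 needs one slot per diagram per event. A small sketch, assuming (the header alone does not pin this down) that the count-valued keys write through an int pointer:

#include "umami.h"
#include <vector>

// Allocate the UMAMI_OUT_DIAGRAM_AMP2 output for one batch of `stride` events.
std::vector<double> makeAmp2Buffer( size_t stride )
{
  int n_diagrams = 0;
  umami_get_meta( UMAMI_META_DIAGRAM_COUNT, &n_diagrams ); // assumed to fill an int
  return std::vector<double>( stride * static_cast<size_t>( n_diagrams ) );
}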
+
+  /**
+   * Queries a global metadata entry of this matrix element code, e.g. the device
+   * it runs on or its number of particles, diagrams, helicity and color
+   * configurations.
+   *
+   * @param meta_key
+   *     key of the metadata entry to query
+   * @param result
+   *     pointer to caller-allocated memory receiving the value; its type depends
+   *     on the key
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *     pointer to an instance of the subprocess. Has to be cleaned up by
+   *     the caller with `umami_free`.
+   * @param param_card_path
+   *     path to the parameter file
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     real part of the parameter value
+   * @param parameter_imag
+   *     imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param name
+   *     name of the parameter
+   * @param parameter_real
+   *     pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *     pointer to double to return imaginary part of the parameter value. Ignored
+   *     for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @param count
+   *     number of events to evaluate the matrix element for
+   * @param stride
+   *     stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *     offset of the event index
+   * @param input_count
+   *     number of inputs to the matrix element
+   * @param input_keys
+   *     pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *     pointer to an array of void pointers to the inputs. The type of the inputs
+   *     depends on the input key
+   * @param output_count
+   *     number of outputs to the matrix element
+   * @param output_keys
+   *     pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *     pointer to an array of void pointers to the outputs. The type of the outputs
+   *     depends on the output key. The caller is responsible for allocating memory for
+   *     the outputs.
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance.
+   *
+   * @param handle
+   *     handle of a matrix element instance
+   * @return
+   *     UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt
index e728335e4c..ddfce4015b 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox".
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -72,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.07860422134399414  +DEBUG: model prefixing takes 0.034606218338012695  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -87,21 +88,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.729 s +1 processes with 72 diagrams generated in 1.407 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: 
initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -113,25 +114,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.132 s -Wrote files for 119 helas calls in 0.360 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 
64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.068 s +Wrote files for 119 helas calls in 3.822 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.215 s +ALOHA: aloha creates 5 routines in 0.160 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.214 s +ALOHA: aloha creates 10 routines in 0.152 s VVV5 VVV5 FFV1 @@ -141,32 +142,34 @@ ALOHA: aloha creates 10 routines in 0.214 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m5.833s -user 0m5.426s -sys 0m0.391s -Code generation completed in 6 seconds +real 0m13.208s +user 0m3.391s +sys 0m0.650s +Code generation completed in 13 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -187,9 +190,9 @@ Code generation completed in 6 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -216,9 +219,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat index 7758c3603b..ee875f040f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/ident_card.dat @@ -216,17 +216,19 @@ decay 23 mdl_WZ decay 24 mdl_WW decay 25 mdl_WH decay 6 mdl_WT +mass 1 mdl_MD mass 11 mdl_Me mass 13 mdl_MMU mass 15 mdl_MTA -mass 1 mdl_MD +mass 2 mdl_MU mass 23 mdl_MZ mass 25 mdl_MH -mass 2 mdl_MU mass 3 mdl_MS mass 4 mdl_MC mass 5 mdl_MB mass 6 mdl_MT +smeft 1 mdl_cG +smeft 10 mdl_cuHRe smeft 100 mdl_ceHRe smeft 101 mdl_ceWRe smeft 102 mdl_ceBRe @@ -237,7 +239,7 @@ smeft 106 mdl_cll smeft 107 mdl_cll1 smeft 108 mdl_clj1 smeft 109 mdl_clj3 -smeft 10 mdl_cuHRe +smeft 11 mdl_ctHRe smeft 110 mdl_cQl1 smeft 111 mdl_cQl3 smeft 112 mdl_cee @@ -248,7 +250,7 @@ smeft 116 mdl_cbe smeft 117 mdl_cje smeft 118 mdl_cQe smeft 119 mdl_clu -smeft 11 mdl_ctHRe +smeft 12 mdl_cdHRe smeft 120 mdl_ctl smeft 121 mdl_cld smeft 122 mdl_cbl @@ -259,7 +261,6 @@ smeft 126 mdl_cleju1Re smeft 127 mdl_cleQt1Re smeft 128 mdl_cleju3Re smeft 129 mdl_cleQt3Re -smeft 12 mdl_cdHRe smeft 13 mdl_cbHRe smeft 14 mdl_cuGRe smeft 15 mdl_ctGRe @@ -267,7 +268,7 @@ smeft 16 mdl_cuWRe smeft 17 mdl_ctWRe smeft 18 mdl_cuBRe smeft 19 mdl_ctBRe -smeft 1 mdl_cG +smeft 2 mdl_cW smeft 20 mdl_cdGRe smeft 21 mdl_cbGRe smeft 22 mdl_cdWRe @@ -278,7 +279,7 @@ smeft 26 mdl_cHj1 smeft 27 mdl_cHQ1 smeft 28 mdl_cHj3 smeft 29 mdl_cHQ3 -smeft 2 mdl_cW +smeft 3 mdl_cH smeft 30 mdl_cHu smeft 31 mdl_cHt smeft 32 mdl_cHd @@ -289,7 +290,7 @@ smeft 36 mdl_cjj11 smeft 37 mdl_cjj18 smeft 38 mdl_cjj31 smeft 39 mdl_cjj38 -smeft 3 mdl_cH +smeft 4 mdl_cHbox smeft 40 mdl_cQj11 smeft 41 mdl_cQj18 smeft 42 mdl_cQj31 @@ -300,7 +301,7 @@ smeft 46 mdl_cuu1 smeft 47 mdl_cuu8 smeft 48 mdl_ctt smeft 49 mdl_ctu1 -smeft 4 mdl_cHbox +smeft 5 mdl_cHDD smeft 50 mdl_ctu8 smeft 51 mdl_cdd1 smeft 52 mdl_cdd8 @@ -311,7 +312,7 @@ smeft 56 mdl_cud1 smeft 57 mdl_ctb1 smeft 58 mdl_ctd1 smeft 59 mdl_cbu1 -smeft 5 mdl_cHDD +smeft 6 mdl_cHG smeft 60 mdl_cud8 smeft 61 mdl_ctb8 smeft 62 mdl_ctd8 @@ -322,7 +323,7 @@ smeft 66 mdl_cju1 smeft 67 mdl_cQu1 smeft 68 mdl_cju8 smeft 69 mdl_cQu8 -smeft 6 mdl_cHG +smeft 7 mdl_cHW smeft 70 mdl_ctj1 smeft 71 mdl_ctj8 smeft 72 mdl_cQt1 @@ -333,7 +334,7 @@ smeft 76 mdl_cQd1 smeft 77 mdl_cQd8 smeft 78 mdl_cbj1 smeft 79 mdl_cbj8 -smeft 7 mdl_cHW +smeft 8 mdl_cHB smeft 80 mdl_cQb1 smeft 81 mdl_cQb8 smeft 82 mdl_cjQtu1Re @@ 
-344,7 +345,7 @@ smeft 86 mdl_cjujd1Re smeft 87 mdl_cjujd8Re smeft 88 mdl_cjujd11Re smeft 89 mdl_cjujd81Re -smeft 8 mdl_cHB +smeft 9 mdl_cHWB smeft 90 mdl_cQtjd1Re smeft 91 mdl_cQtjd8Re smeft 92 mdl_cjuQb1Re @@ -355,7 +356,7 @@ smeft 96 mdl_cjtQd1Re smeft 97 mdl_cjtQd8Re smeft 98 mdl_cQtQb1Re smeft 99 mdl_cQtQb8Re -smeft 9 mdl_cHWB +smeftcpv 1 mdl_cGtil smeftcpv 10 mdl_ctWIm smeftcpv 11 mdl_cuBIm smeftcpv 12 mdl_ctBIm @@ -366,7 +367,7 @@ smeftcpv 16 mdl_cbWIm smeftcpv 17 mdl_cdBIm smeftcpv 18 mdl_cbBIm smeftcpv 19 mdl_cuHIm -smeftcpv 1 mdl_cGtil +smeftcpv 2 mdl_cWtil smeftcpv 20 mdl_ctHIm smeftcpv 21 mdl_cdHIm smeftcpv 22 mdl_cbHIm @@ -377,7 +378,7 @@ smeftcpv 26 mdl_cutbd8Im smeftcpv 27 mdl_cjQtu1Im smeftcpv 28 mdl_cjQtu8Im smeftcpv 29 mdl_cjQbd1Im -smeftcpv 2 mdl_cWtil +smeftcpv 3 mdl_cHGtil smeftcpv 30 mdl_cjQbd8Im smeftcpv 31 mdl_cjujd1Im smeftcpv 32 mdl_cjujd8Im @@ -388,7 +389,7 @@ smeftcpv 36 mdl_cQtjd8Im smeftcpv 37 mdl_cjuQb1Im smeftcpv 38 mdl_cjuQb8Im smeftcpv 39 mdl_cQujb1Im -smeftcpv 3 mdl_cHGtil +smeftcpv 4 mdl_cHWtil smeftcpv 40 mdl_cQujb8Im smeftcpv 41 mdl_cjtQd1Im smeftcpv 42 mdl_cjtQd8Im @@ -399,12 +400,11 @@ smeftcpv 46 mdl_ceWIm smeftcpv 47 mdl_ceBIm smeftcpv 48 mdl_cledjIm smeftcpv 49 mdl_clebQIm -smeftcpv 4 mdl_cHWtil +smeftcpv 5 mdl_cHBtil smeftcpv 50 mdl_cleju1Im smeftcpv 51 mdl_cleju3Im smeftcpv 52 mdl_cleQt1Im smeftcpv 53 mdl_cleQt3Im -smeftcpv 5 mdl_cHBtil smeftcpv 6 mdl_cHWBtil smeftcpv 7 mdl_cuGIm smeftcpv 8 mdl_ctGIm @@ -414,10 +414,10 @@ sminputs 1 mdl_MW sminputs 2 mdl_Gf sminputs 3 aS switches 1 mdl_linearPropCorrections +yukawa 1 mdl_ymdo yukawa 11 mdl_yme yukawa 13 mdl_ymm yukawa 15 mdl_ymtau -yukawa 1 mdl_ymdo yukawa 2 mdl_ymup yukawa 3 mdl_yms yukawa 4 mdl_ymc diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc index dab3aac603..e7bc7ae438 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/param_card.inc @@ -2,17 +2,19 @@ MDL_WW = 2.085000D+00 MDL_WH = 4.070000D-03 MDL_WT = 1.330000D+00 + MDL_MD = 4.670000D-03 MDL_ME = 5.110000D-04 MDL_MMU = 1.056600D-01 MDL_MTA = 1.777000D+00 - MDL_MD = 4.670000D-03 + MDL_MU = 2.160000D-03 MDL_MZ = 9.118760D+01 MDL_MH = 1.250900D+02 - MDL_MU = 2.160000D-03 MDL_MS = 9.300000D-02 MDL_MC = 1.270000D+00 MDL_MB = 4.180000D+00 MDL_MT = 1.727600D+02 + MDL_CG = 
0.000000D+00 + MDL_CUHRE = 0.000000D+00 MDL_CEHRE = 0.000000D+00 MDL_CEWRE = 0.000000D+00 MDL_CEBRE = 0.000000D+00 @@ -23,7 +25,7 @@ MDL_CLL1 = 0.000000D+00 MDL_CLJ1 = 0.000000D+00 MDL_CLJ3 = 0.000000D+00 - MDL_CUHRE = 0.000000D+00 + MDL_CTHRE = 0.000000D+00 MDL_CQL1 = 0.000000D+00 MDL_CQL3 = 0.000000D+00 MDL_CEE = 0.000000D+00 @@ -34,7 +36,7 @@ MDL_CJE = 0.000000D+00 MDL_CQE = 0.000000D+00 MDL_CLU = 0.000000D+00 - MDL_CTHRE = 0.000000D+00 + MDL_CDHRE = 0.000000D+00 MDL_CTL = 0.000000D+00 MDL_CLD = 0.000000D+00 MDL_CBL = 0.000000D+00 @@ -45,7 +47,6 @@ MDL_CLEQT1RE = 0.000000D+00 MDL_CLEJU3RE = 0.000000D+00 MDL_CLEQT3RE = 0.000000D+00 - MDL_CDHRE = 0.000000D+00 MDL_CBHRE = 0.000000D+00 MDL_CUGRE = 0.000000D+00 MDL_CTGRE = 0.000000D+00 @@ -53,7 +54,7 @@ MDL_CTWRE = 0.000000D+00 MDL_CUBRE = 0.000000D+00 MDL_CTBRE = 0.000000D+00 - MDL_CG = 0.000000D+00 + MDL_CW = 0.000000D+00 MDL_CDGRE = 0.000000D+00 MDL_CBGRE = 0.000000D+00 MDL_CDWRE = 0.000000D+00 @@ -64,7 +65,7 @@ MDL_CHQ1 = 0.000000D+00 MDL_CHJ3 = 0.000000D+00 MDL_CHQ3 = 0.000000D+00 - MDL_CW = 0.000000D+00 + MDL_CH = 0.000000D+00 MDL_CHU = 0.000000D+00 MDL_CHT = 0.000000D+00 MDL_CHD = 0.000000D+00 @@ -75,7 +76,7 @@ MDL_CJJ18 = 0.000000D+00 MDL_CJJ31 = 0.000000D+00 MDL_CJJ38 = 0.000000D+00 - MDL_CH = 0.000000D+00 + MDL_CHBOX = 0.000000D+00 MDL_CQJ11 = 0.000000D+00 MDL_CQJ18 = 0.000000D+00 MDL_CQJ31 = 0.000000D+00 @@ -86,7 +87,7 @@ MDL_CUU8 = 0.000000D+00 MDL_CTT = 0.000000D+00 MDL_CTU1 = 0.000000D+00 - MDL_CHBOX = 0.000000D+00 + MDL_CHDD = 0.000000D+00 MDL_CTU8 = 0.000000D+00 MDL_CDD1 = 0.000000D+00 MDL_CDD8 = 0.000000D+00 @@ -97,7 +98,7 @@ MDL_CTB1 = 0.000000D+00 MDL_CTD1 = 0.000000D+00 MDL_CBU1 = 0.000000D+00 - MDL_CHDD = 0.000000D+00 + MDL_CHG = 0.000000D+00 MDL_CUD8 = 0.000000D+00 MDL_CTB8 = 0.000000D+00 MDL_CTD8 = 0.000000D+00 @@ -108,7 +109,7 @@ MDL_CQU1 = 0.000000D+00 MDL_CJU8 = 0.000000D+00 MDL_CQU8 = 0.000000D+00 - MDL_CHG = 0.000000D+00 + MDL_CHW = 0.000000D+00 MDL_CTJ1 = 0.000000D+00 MDL_CTJ8 = 0.000000D+00 MDL_CQT1 = 0.000000D+00 @@ -119,7 +120,7 @@ MDL_CQD8 = 0.000000D+00 MDL_CBJ1 = 0.000000D+00 MDL_CBJ8 = 0.000000D+00 - MDL_CHW = 0.000000D+00 + MDL_CHB = 0.000000D+00 MDL_CQB1 = 0.000000D+00 MDL_CQB8 = 0.000000D+00 MDL_CJQTU1RE = 0.000000D+00 @@ -130,7 +131,7 @@ MDL_CJUJD8RE = 0.000000D+00 MDL_CJUJD11RE = 0.000000D+00 MDL_CJUJD81RE = 0.000000D+00 - MDL_CHB = 0.000000D+00 + MDL_CHWB = 0.000000D+00 MDL_CQTJD1RE = 0.000000D+00 MDL_CQTJD8RE = 0.000000D+00 MDL_CJUQB1RE = 0.000000D+00 @@ -141,7 +142,7 @@ MDL_CJTQD8RE = 0.000000D+00 MDL_CQTQB1RE = 0.000000D+00 MDL_CQTQB8RE = 0.000000D+00 - MDL_CHWB = 0.000000D+00 + MDL_CGTIL = 0.000000D+00 MDL_CTWIM = 0.000000D+00 MDL_CUBIM = 0.000000D+00 MDL_CTBIM = 0.000000D+00 @@ -152,7 +153,7 @@ MDL_CDBIM = 0.000000D+00 MDL_CBBIM = 0.000000D+00 MDL_CUHIM = 0.000000D+00 - MDL_CGTIL = 0.000000D+00 + MDL_CWTIL = 0.000000D+00 MDL_CTHIM = 0.000000D+00 MDL_CDHIM = 0.000000D+00 MDL_CBHIM = 0.000000D+00 @@ -163,7 +164,7 @@ MDL_CJQTU1IM = 0.000000D+00 MDL_CJQTU8IM = 0.000000D+00 MDL_CJQBD1IM = 0.000000D+00 - MDL_CWTIL = 0.000000D+00 + MDL_CHGTIL = 0.000000D+00 MDL_CJQBD8IM = 0.000000D+00 MDL_CJUJD1IM = 0.000000D+00 MDL_CJUJD8IM = 0.000000D+00 @@ -174,7 +175,7 @@ MDL_CJUQB1IM = 0.000000D+00 MDL_CJUQB8IM = 0.000000D+00 MDL_CQUJB1IM = 0.000000D+00 - MDL_CHGTIL = 0.000000D+00 + MDL_CHWTIL = 0.000000D+00 MDL_CQUJB8IM = 0.000000D+00 MDL_CJTQD1IM = 0.000000D+00 MDL_CJTQD8IM = 0.000000D+00 @@ -185,12 +186,11 @@ MDL_CEBIM = 0.000000D+00 MDL_CLEDJIM = 0.000000D+00 MDL_CLEBQIM = 0.000000D+00 - MDL_CHWTIL = 
0.000000D+00 + MDL_CHBTIL = 0.000000D+00 MDL_CLEJU1IM = 0.000000D+00 MDL_CLEJU3IM = 0.000000D+00 MDL_CLEQT1IM = 0.000000D+00 MDL_CLEQT3IM = 0.000000D+00 - MDL_CHBTIL = 0.000000D+00 MDL_CHWBTIL = 0.000000D+00 MDL_CUGIM = 0.000000D+00 MDL_CTGIM = 0.000000D+00 @@ -200,10 +200,10 @@ MDL_GF = 1.166379D-05 AS = 1.179000D-01 MDL_LINEARPROPCORRECTIONS = 0.000000D+00 + MDL_YMDO = 4.670000D-03 MDL_YME = 5.110000D-04 MDL_YMM = 1.056600D-01 MDL_YMTAU = 1.777000D+00 - MDL_YMDO = 4.670000D-03 MDL_YMUP = 2.160000D-03 MDL_YMS = 9.300000D-02 MDL_YMC = 1.270000D+00 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }

+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h
index 24800c08c9..50496fa2bf 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h"
+#include "processConfig.h"

 #include

@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;

   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;

 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc
index 84ba0de9b4..7344868d8d 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"

 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE
unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -397,8 +458,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 1 FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv 
+= cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -413,8 +477,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -429,8 +496,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += 1. / 2. * amp_sv[0]; jamp_sv[2] -= 1. / 2. * amp_sv[0]; @@ -446,8 +516,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; @@ -462,8 +535,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; @@ -478,8 +554,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= 1. / 2. * amp_sv[0]; jamp_sv[3] += 1. / 2. * amp_sv[0]; @@ -494,8 +573,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 7 FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[6] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0];
@@ -510,8 +592,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 8
       FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[7] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -526,8 +611,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 9
       FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[8] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -542,8 +630,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 10
       FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[9] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -560,8 +651,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 11
       FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[10] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[5] += 1. / 6. * amp_sv[0];
@@ -574,8 +668,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 12
       FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[11] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 6. * amp_sv[0];
       jamp_sv[5] += 1. / 2. * amp_sv[0];
@@ -589,8 +686,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 13
       FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[12] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 6. * amp_sv[0];
       jamp_sv[5] += 1. / 2. * amp_sv[0];
@@ -603,8 +703,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 14
       FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[13] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 2. * amp_sv[0];
       jamp_sv[5] += 1. / 6. * amp_sv[0];
@@ -617,8 +720,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 15
       FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[14] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 2. * amp_sv[0];
       jamp_sv[4] -= 1. / 6. * amp_sv[0];
@@ -631,8 +737,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 16
       FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[15] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[4] -= 1. / 2. * amp_sv[0];
@@ -645,8 +754,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 17
       FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[16] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[1] += 1. / 6. * amp_sv[0];
@@ -659,8 +771,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 18
       VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[17] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -673,8 +788,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 19
       FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[18] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 6. * amp_sv[0];
       jamp_sv[1] += 1. / 2. * amp_sv[0];
@@ -687,8 +805,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 20
       VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[19] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -703,8 +824,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 21
       FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[20] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 6. * amp_sv[0];
       jamp_sv[8] -= 1. / 2. * amp_sv[0];
@@ -717,8 +841,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 22
       FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[21] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 2. * amp_sv[0];
       jamp_sv[8] -= 1. / 6. * amp_sv[0];
@@ -731,8 +858,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 23
       FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[22] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 6. * amp_sv[0];
       jamp_sv[10] += 1. / 2. * amp_sv[0];
@@ -745,8 +875,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 24
       FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[23] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 2. * amp_sv[0];
       jamp_sv[10] += 1. / 6. * amp_sv[0];
@@ -759,8 +892,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 25
       FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[24] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 6. * amp_sv[0];
       jamp_sv[6] += 1. / 2. * amp_sv[0];
@@ -773,8 +909,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 26
       FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[25] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 2. * amp_sv[0];
       jamp_sv[6] += 1. / 6. * amp_sv[0];
@@ -787,8 +926,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 27
       FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[26] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 2. * amp_sv[0];
       jamp_sv[10] += 1. / 6. * amp_sv[0];
@@ -801,8 +943,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 28
       VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[27] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -815,8 +960,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 29
       FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[28] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 6. * amp_sv[0];
       jamp_sv[10] += 1. / 2. * amp_sv[0];
@@ -829,8 +977,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 30
       VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[29] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -844,8 +995,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 31
       FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[30] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 6. * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
@@ -858,8 +1012,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 32
       FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[31] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 2. * amp_sv[0];
       jamp_sv[7] -= 1. / 6. * amp_sv[0];
@@ -872,8 +1029,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 33
       FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[32] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * amp_sv[0];
       jamp_sv[7] -= 1. / 6. * amp_sv[0];
@@ -886,8 +1046,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 34
       FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[33] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
@@ -900,8 +1063,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 35
       FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[34] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
       jamp_sv[6] += 1. / 6. * amp_sv[0];
@@ -914,8 +1080,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 36
       FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[35] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= 1. / 6. * amp_sv[0];
       jamp_sv[6] += 1. / 2. * amp_sv[0];
@@ -928,8 +1097,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 37
       FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[36] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * amp_sv[0];
       jamp_sv[3] -= 1. / 6. * amp_sv[0];
@@ -942,8 +1114,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 38
       VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[37] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -956,8 +1131,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 39
       FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[38] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -970,8 +1148,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 40
       VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[39] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -985,8 +1166,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 41
       FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[40] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= 1. / 6. * amp_sv[0];
       jamp_sv[9] += 1. / 2. * amp_sv[0];
@@ -999,8 +1183,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 42
       FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[41] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
       jamp_sv[9] += 1. / 6. * amp_sv[0];
@@ -1013,8 +1200,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 43
       FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[42] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 6. * amp_sv[0];
       jamp_sv[7] -= 1. / 2. * amp_sv[0];
@@ -1027,8 +1217,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 44
       FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[43] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 2. * amp_sv[0];
       jamp_sv[7] -= 1. / 6. * amp_sv[0];
@@ -1041,8 +1234,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 45
       FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[44] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 6. * amp_sv[0];
       jamp_sv[11] -= 1. / 2. * amp_sv[0];
@@ -1055,8 +1251,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 46
       FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[45] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 2. * amp_sv[0];
       jamp_sv[11] -= 1. / 6. * amp_sv[0];
@@ -1069,8 +1268,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 47
       FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[46] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += 1. / 2. * amp_sv[0];
       jamp_sv[11] -= 1. / 6. * amp_sv[0];
@@ -1083,8 +1285,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 48
       VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 48 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[47] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1097,8 +1302,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 49
       FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[48] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[9] += 1. / 6. * amp_sv[0];
       jamp_sv[11] -= 1. / 2. * amp_sv[0];
@@ -1111,8 +1319,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 50
       VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[49] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1125,8 +1336,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 51
       FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[50] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 2. * amp_sv[0];
       jamp_sv[9] += 1. / 6. * amp_sv[0];
@@ -1139,8 +1353,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 52
       FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[51] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1153,8 +1370,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 53
       FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[52] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[8] -= 1. / 6. * amp_sv[0];
       jamp_sv[9] += 1. / 2. * amp_sv[0];
@@ -1167,8 +1387,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 54
       FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[53] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1181,8 +1404,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 55
       FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[54] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[2] += 1. / 6. * amp_sv[0];
@@ -1195,8 +1421,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 56
       FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[55] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1209,8 +1438,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 57
       FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[56] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 6. * amp_sv[0];
       jamp_sv[2] += 1. / 2. * amp_sv[0];
@@ -1223,8 +1455,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 58
       FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[57] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1237,8 +1472,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 59
       FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[58] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += 1. / 2. * amp_sv[0];
       jamp_sv[11] -= 1. / 6. * amp_sv[0];
@@ -1251,8 +1489,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 60
       FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[59] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1265,8 +1506,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 61
       FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[60] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[10] += 1. / 6. * amp_sv[0];
       jamp_sv[11] -= 1. / 2. * amp_sv[0];
@@ -1279,8 +1523,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 62
       FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[61] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1293,8 +1540,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 63
       FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[62] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 2. * amp_sv[0];
       jamp_sv[3] -= 1. / 6. * amp_sv[0];
@@ -1307,8 +1557,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 64
       FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[63] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1321,8 +1574,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 65
       FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[64] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 6. * amp_sv[0];
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
@@ -1335,8 +1591,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 66
       FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[65] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
       jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
@@ -1371,8 +1630,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 68
       VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[67] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[1] += 1. / 2. * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * amp_sv[0];
@@ -1387,8 +1649,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 69
       VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[68] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[2] += 1. / 2. * amp_sv[0];
       jamp_sv[5] -= 1. / 2. * amp_sv[0];
@@ -1425,8 +1690,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 71
       VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[70] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[0] -= 1. / 2. * amp_sv[0];
       jamp_sv[4] += 1. / 2. * amp_sv[0];
@@ -1441,8 +1709,11 @@ namespace mg5amcCpu
       // Amplitude(s) for diagram number 72
       VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      if( storeChannelWeights )
+      {
+        numerators_sv[71] += cxabs2( amp_sv[0] );
+        denominators_sv += cxabs2( amp_sv[0] );
+      }
 #endif
       jamp_sv[3] -= 1. / 2. * amp_sv[0];
       jamp_sv[4] += 1. / 2. * amp_sv[0];
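The mechanical change repeated in every hunk above is the important one: the old code kept a single running numerator, gated on the event's `channelId == N`, while the new code records |amp|^2 for every diagram whenever channel weights are requested. A minimal scalar sketch of the new bookkeeping (scalar stand-ins for the generated SIMD types; `ndiagrams` and the names here are illustrative, not generated code):

#include <array>
#include <complex>

constexpr int ndiagrams = 72; // matches processConfig::ndiagrams for this process

struct ChannelWeights
{
  std::array<double, ndiagrams> numerators{}; // one |amp|^2 accumulator per diagram
  double denominator = 0.;                    // shared running sum over all diagrams
};

// Called once per diagram, mirroring 'numerators_sv[idiag] += cxabs2( amp_sv[0] )'.
inline void accumulate( ChannelWeights& w, int idiag, std::complex<double> amp, bool storeChannelWeights )
{
  if( !storeChannelWeights ) return;  // old code instead tested 'channelId == idiag+1'
  const double a2 = std::norm( amp ); // |amp|^2, the scalar analogue of cxabs2()
  w.numerators[idiag] += a2;          // 0-based slot (diagram 8 -> numerators[7])
  w.denominator += a2;
}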
@@ -1810,9 +2081,8 @@ namespace mg5amcCpu
     gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) );
     // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    constexpr fptype_sv* allJamp2s = nullptr;        // no need for color selection during helicity filtering
-    constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement
-    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
+    constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering
+    gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads );
 #else
     gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads );
 #endif
@@ -1888,8 +2158,7 @@ namespace mg5amcCpu
     cxtype_sv jamp_sv[ncolor] = {}; // all zeros
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
-    constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
-    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
+    calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry?
 #else
     calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
 #endif /* clang-format on */
@@ -1951,25 +2220,35 @@ namespace mg5amcCpu
     fptype* ghelAllNumerators,         // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators,       // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+    bool storeChannelWeights,          // if true, compute final multichannel weights
+    bool mulChannelWeight,             // if true, multiply matrix element by channel weight
 #endif
-    const fptype globaldenom ) /* clang-format on */
+    const fptype globaldenom) /* clang-format on */
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     allMEs[ievt] /= globaldenom;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const int nevt = gridDim.x * blockDim.x;
-    if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+    if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0')
     {
      fptype* totAllNumerators = ghelAllNumerators;     // reuse "helicity #0" buffer to compute the total over all helicities
      fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities
      for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1
      {
-        fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
        fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-        totAllNumerators[ievt] += hAllNumerators[ievt];
        totAllDenominators[ievt] += hAllDenominators[ievt];
+        fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams;
+        fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams;
+        for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag )
+        {
+          firstNumerator[idiag] += hAllNumerators[idiag];
+        }
+      }
+      if( mulChannelWeight )
+      {
+        unsigned int channelId = allChannelIds[ievt];
+        allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt];
      }
-      allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt];
    }
 #endif
    return;
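With per-diagram numerators available, the multichannel reweighting applied by the rewritten normalise_output reduces, per event, to ME *= N_c / D, where N_c is the numerator of the selected channel summed over good helicities and D is the helicity-summed denominator. A plain-C++ sketch of that reduction, assuming the [ighel][ievt][idiag] layout implied by the pointer arithmetic above (names are illustrative):

#include <cassert>

// Layout assumption (from the patch): ghelNum[( ievt + ighel * nevt ) * ndiag + idiag].
inline double channelWeight( double* ghelNum, const double* ghelDen,
                             int nGoodHel, int nevt, int ndiag,
                             int ievt, unsigned int channelId )
{
  assert( channelId >= 1 );
  double* totNum = ghelNum + ievt * ndiag; // reuse the "helicity #0" slice as the total
  double totDen = ghelDen[ievt];
  for( int ighel = 1; ighel < nGoodHel; ighel++ )
  {
    const double* hNum = ghelNum + ( ievt + ighel * nevt ) * ndiag;
    for( int idiag = 0; idiag < ndiag; ++idiag ) totNum[idiag] += hNum[idiag];
    totDen += ghelDen[ievt + ighel * nevt];
  }
  return totNum[channelId - 1] / totDen; // multiplied into |M|^2 when mulChannelWeight is set
}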
@@ -2014,16 +2293,44 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
   __global__ void
-  select_col( int* allselcol,                    // output: color selection[nevt]
-              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
-              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
-              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
-              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  select_col_and_diag( int* allselcol,                    // output: color selection[nevt]
+                       unsigned int* allDiagramIdsOut,    // output: sampled diagram ids
+                       const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+                       const fptype* allrnddiagram,       // input: random numbers[nevt] for diagram selection
+                       const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+                       const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+                       const fptype* allNumerators,       // input: all numerators
+                       const fptype* allDenominators,     // input: all denominators
+                       const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
   {
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
     // SCALAR channelId for the current event (CUDA)
     unsigned int channelId = gpu_channelId( allChannelIds );
     // Event-by-event random choice of color #402
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      fptype numerator_sum = 0., normalization = 0.;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        normalization += allNumerators[ievt * processConfig::ndiagrams + ichan];
+      }
+      channelId = mgOnGpu::nchannels;
+      for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+      {
+        if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+        numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan];
+        if( allrnddiagram[ievt] < numerator_sum / normalization )
+        {
+          channelId = ichan + 1;
+          break;
+        }
+      }
+      allDiagramIdsOut[ievt] = channelId;
+    }
+
     if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
     {
       if( channelId > mgOnGpu::nchannels )
@@ -2091,6 +2398,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -2100,6 +2408,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the channel weight into the ME output
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities
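select_col_and_diag draws the channel by inverting the discrete CDF built from the per-diagram numerators, skipping diagrams that have no SDE configuration (channel2iconfig == -1). A standalone host-side sketch of that sampling step, with hypothetical container types:

#include <cstddef>
#include <vector>

// Inverse-CDF channel draw, mirroring the kernel logic above.
inline unsigned int sampleChannel( const std::vector<double>& numerators, // one weight per diagram
                                   const std::vector<int>& channel2iconfig,
                                   double rnd ) // uniform random number in [0,1)
{
  double norm = 0.;
  for( std::size_t i = 0; i < numerators.size(); ++i )
    if( channel2iconfig[i] != -1 ) norm += numerators[i];
  double cumsum = 0.;
  unsigned int channelId = static_cast<unsigned int>( numerators.size() ); // fallback, as in the kernel
  for( std::size_t i = 0; i < numerators.size(); ++i )
  {
    if( channel2iconfig[i] == -1 ) continue; // diagram cannot be sampled
    cumsum += numerators[i];
    if( rnd < cumsum / norm )
    {
      channelId = i + 1; // 1-based channelId convention
      break;
    }
  }
  return channelId;
}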
@@ -2111,8 +2421,10 @@ namespace mg5amcCpu
 #else
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     int* allselcol, // output: helicity selection[nevt]
-    fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
-    fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
+    fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the channel weight into the ME output
 #endif
     const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
@@ -2140,7 +2452,6 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = HostAccessNumerators;   // non-trivial access: buffer includes all events
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
-    using CID_ACCESS = HostAccessChannelIds;   // non-trivial access: buffer includes all events
 #endif
 #endif
@@ -2156,7 +2467,7 @@ namespace mg5amcCpu
     gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
-    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) );
     gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
     gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
@@ -2170,11 +2481,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv = fptype_sv{ 0 };
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
       fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+      fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
       fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-      numerators_sv = fptype_sv{ 0 };
+      for( int i = 0; i < processConfig::ndiagrams; ++i )
+      {
+        numerators_sv[i] = fptype_sv{ 0 };
+      }
       denominators_sv = fptype_sv{ 0 };
 #endif
     }
@@ -2186,6 +2500,7 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
   // *** START OF PART 1a - CUDA (one event per GPU thread) ***
+
   // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
   // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
   // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
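The gpuMemset and ieventAccessRecord changes above all follow from one layout decision: the numerator super-buffer grows by a factor of ndiagrams and is addressed as [ighel][ievt][idiag]. A sketch of the index arithmetic with the strides spelled out (helper names are illustrative):

#include <cstddef>

// Buffer size and flat index for the enlarged numerator super-buffer.
constexpr std::size_t numeratorsBufferSize( std::size_t nGoodHel, std::size_t nevt, std::size_t ndiag )
{
  return nGoodHel * ndiag * nevt; // was nGoodHel * nevt before the patch
}
constexpr std::size_t numeratorIndex( std::size_t ighel, std::size_t ievt, std::size_t idiag,
                                      std::size_t nevt, std::size_t ndiag )
{
  return ( ighel * nevt + ievt ) * ndiag + idiag; // [ighel][ievt][idiag]
}
static_assert( numeratorIndex( 0, 0, 0, 16, 72 ) == 0, "origin" );
static_assert( numeratorIndex( 1, 0, 0, 16, 72 ) == 16 * 72, "per-helicity stride = nevt * ndiagrams" );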
@@ -2194,9 +2509,10 @@ namespace mg5amcCpu
      const int ihel = cGoodHel[ighel];
      fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams;
      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
-      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt );
 #else
      gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt );
 #endif
@@ -2207,9 +2523,15 @@ namespace mg5amcCpu
     // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color
     // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism)
     gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads );
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads );
+    bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] );
+
+    // Event-by-event random choice of color and diagram #402
+    gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads );
+#else
+    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
 #endif
     // *** END OF PART 1a - CUDA (one event per GPU thread) ***
@@ -2247,40 +2569,6 @@ namespace mg5amcCpu
       const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time
 #else
       const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time
-#endif
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
-      // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
-      // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-      if( allChannelIds != nullptr )
-      {
-        // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
-        const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
-        uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-#ifndef MGONGPU_CPPSIMD
-        // NB: channelIds_sv is a scalar in no-SIMD C++
-        channelId = channelIds_sv;
-#else
-        // NB: channelIds_sv is a vector in SIMD C++
-        channelId = channelIds_sv[0]; // element[0]
-        for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
-        {
-          assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
-        }
-#endif
-        assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
-        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
-        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
-        // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-        for( int i = 0; i < neppV; ++i )
-        {
-          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
-        }
-#endif
-      }
 #endif
       // Running sum of partial amplitudes squared for event by event color selection (#402)
       // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
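For context, the launch pattern used above (one stream per good helicity, followed by synchronizing reduction/selection kernels) looks roughly as follows in plain CUDA; the patch itself goes through the gpuStream_t / gpuLaunchKernelStream wrappers, so this is only an illustration with hypothetical names:

#include <cuda_runtime.h>
#include <vector>

__global__ void workForHelicity( int ihel, float* out )
{
  // Each helicity writes a disjoint slice, so streams may overlap freely.
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  out[ihel * gridDim.x * blockDim.x + tid] = (float)ihel;
}

void launchPerHelicity( int nGoodHel, int nblocks, int nthreads, float* out )
{
  std::vector<cudaStream_t> streams( nGoodHel );
  for( auto& s : streams ) cudaStreamCreate( &s );
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    workForHelicity<<<nblocks, nthreads, 0, streams[ighel]>>>( ighel, out );
  // Later kernels (helicity/color selection, normalise_output) need all streams done.
  cudaDeviceSynchronize();
  for( auto& s : streams ) cudaStreamDestroy( s );
}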
@@ -2295,7 +2583,8 @@ namespace mg5amcCpu
       cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps)
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
       // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924
-      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 );
+      bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr;
+      calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 );
 #else
       calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
 #endif
@@ -2343,82 +2632,97 @@ namespace mg5amcCpu
       }
 #endif
     }
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice)
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
-    {
-      if( channelId > mgOnGpu::nchannels )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-      }
-      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-      if( iconfig <= 0 )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-        assert( iconfig > 0 ); // SANITY CHECK #917
-      }
-      else if( iconfig > (int)mgOnGpu::nconfigSDE )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
-        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-      }
-      fptype_sv targetamp[ncolor] = { 0 };
-      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-      for( int icolC = 0; icolC < ncolor; icolC++ )
-      {
-        if( icolC == 0 )
-          targetamp[icolC] = fptype_sv{ 0 };
-        else
-          targetamp[icolC] = targetamp[icolC - 1];
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-      }
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv targetamp2[ncolor] = { 0 };
-      for( int icolC = 0; icolC < ncolor; icolC++ )
+    const int vecsize = 2 * neppV;
++#else
+    const int vecsize = neppV;
+#endif
+    unsigned int channelIdVec[vecsize];
+    if( allChannelIds != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
      {
-        if( icolC == 0 )
-          targetamp2[icolC] = fptype_sv{ 0 };
-        else
-          targetamp2[icolC] = targetamp2[icolC - 1];
-        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC];
+        const int ievt = ievt00 + ieppV;
+        channelIdVec[ieppV] = allChannelIds[ievt];
      }
-#endif
-      for( int ieppV = 0; ieppV < neppV; ++ieppV )
+    }
+
+    // Event-by-event random choice of channel
+    if( allrnddiagram != nullptr )
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
      {
        const int ievt = ievt00 + ieppV;
-        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-        for( int icolC = 0; icolC < ncolor; icolC++ )
+        fptype numerator_sum = 0., normalization = 0.;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
        {
-#if defined MGONGPU_CPPSIMD
-          // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845)
-          volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] );
-#else
-          const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] );
-#endif
-          if( okcol )
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                         ichan * neppV + ieppV % neppV];
+        }
+        channelIdVec[ieppV] = mgOnGpu::nchannels;
+        for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ )
+        {
+          if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue;
+          numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams +
+                                         ichan * neppV + ieppV % neppV];
+          if( allrnddiagram[ievt] < numerator_sum / normalization )
          {
-            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
+            channelIdVec[ieppV] = ichan + 1;
            break;
          }
        }
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        const int ievt2 = ievt00 + ieppV + neppV;
-        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] );
+        allDiagramIdsOut[ievt] = channelIdVec[ieppV];
+      }
+    }
+
+    // Event-by-event random choice of color #402
+    if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    {
+      for( int ieppV = 0; ieppV < vecsize; ++ieppV )
+      {
+        unsigned int channelId = channelIdVec[ieppV];
+        if( channelId > mgOnGpu::nchannels )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+          assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+        }
+        // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+        // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+        const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+        if( iconfig <= 0 )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+          assert( iconfig > 0 ); // SANITY CHECK #917
+        }
+        else if( iconfig > (int)mgOnGpu::nconfigSDE )
+        {
+          printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
+          assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+        }
+        fptype targetamp[ncolor] = { 0 };
+        // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
        for( int icolC = 0; icolC < ncolor; icolC++ )
        {
-          if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) )
+          if( icolC == 0 )
+            targetamp[icolC] = 0;
+          else
+            targetamp[icolC] = targetamp[icolC - 1];
+          if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] +=
+            jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV];
+        }
+        const int ievt = ievt00 + ieppV;
+        //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+        for( int icolC = 0; icolC < ncolor; icolC++ )
+        {
+          if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
          {
-            allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-            //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 );
+            allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+            //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
            break;
          }
        }
-#endif
      }
    }
    else
@@ -2443,13 +2747,7 @@ namespace mg5amcCpu
     // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event
     // [NB 'sum over final spins, average over initial spins', eg see
     // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
-#ifdef MGONGPUCPP_GPUIMPL
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] );
-#else
-    gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] );
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
     {
       const int ievt0 = ipagV * neppV;
@@ -2457,13 +2755,14 @@ namespace mg5amcCpu
       fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
       MEs_sv /= helcolDenominators[0];
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
+      if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0')
      {
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+        const unsigned int channelId = getChannelId( allChannelIds, ievt0, false );
+        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams );
        fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+        fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators );
        fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-        MEs_sv *= numerators_sv / denominators_sv;
+        MEs_sv *= numerators_sv[channelId - 1] / denominators_sv;
      }
 #endif
      //for( int ieppV = 0; ieppV < neppV; ieppV++ )
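On the C++ side the numerators live in SIMD pages of neppV events, so the flat index used above interleaves diagrams and lanes within a page. A sketch of that index math (helper name is illustrative):

#include <cstddef>

// SoA page layout assumed by the C++ path: [ipagV][idiag][ieppV],
// i.e. index = ipagV * ndiag * neppV + idiag * neppV + ieppV.
constexpr std::size_t cpuNumeratorIndex( std::size_t ievt, std::size_t idiag,
                                         std::size_t neppV, std::size_t ndiag )
{
  const std::size_t ipagV = ievt / neppV; // SIMD page of this event
  const std::size_t ieppV = ievt % neppV; // lane within the page
  return ipagV * ndiag * neppV + idiag * neppV + ieppV;
}
// Matches 'ievt / neppV * neppV * ndiagrams + ichan * neppV + ieppV % neppV' in the patch.
static_assert( cpuNumeratorIndex( 5, 3, 4, 72 ) == 1 * 72 * 4 + 3 * 4 + 1, "page layout" );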
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h
index f8f13801dd..7e444f2546 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h
@@ -163,6 +163,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -171,6 +172,8 @@ namespace mg5amcCpu
     fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
     fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the channel weight into the ME output
 #endif
     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
     fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
@@ -187,6 +190,7 @@ namespace mg5amcCpu
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const fptype* allrndcol, // input: random numbers[nevt] for color selection
     const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899)
+    const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling
 #endif
     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
     int* allselhel, // output: helicity selection[nevt]
@@ -194,6 +198,8 @@ namespace mg5amcCpu
     int* allselcol, // output: helicity selection[nevt]
     fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
     fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
+    unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams)
+    bool mulChannelWeight, // if true, multiply the channel weight into the ME output
 #endif
     const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif /* clang-format on */
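processConfig.h (added below) introduces a per-process, compile-time configuration namespace; buffer sizes that previously scaled only with nevt can now scale with the diagram count at compile time. A hypothetical usage sketch:

#include <cstddef>
#include "processConfig.h"

// Illustrative only: size a per-event numerator block at compile time.
constexpr std::size_t numeratorsPerEvent = processConfig::ndiagrams; // 72 for gg -> ttxttx
// e.g. DeviceBufferSimple( nGoodHel * processConfig::ndiagrams * nevt ), as in the patch.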
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h
new file mode 100644
index 0000000000..b9f07de180
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/processConfig.h
@@ -0,0 +1,16 @@
+// Copyright (C) 2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: ... for the MG5aMC CUDACPP plugin.
+
+
+#ifndef MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H
+#define MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H 1
+
+namespace processConfig {
+
+  constexpr int ndiagrams = 72;
+
+}
+
+#endif // MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc
new file mode 120000
index 0000000000..e603bf8fdb
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.cc
@@ -0,0 +1 @@
+../umami.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h
new file mode 120000
index 0000000000..1267019e18
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/umami.h
@@ -0,0 +1 @@
+../umami.h
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
index e7360b29e2..e093865b60 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
@@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
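For orientation, a hedged caller-side sketch of the umami C interface added below; the param card path and error handling are illustrative, not taken from the patch:

#include "umami.h"
#include <cstdio>

int main()
{
  UmamiDevice device;
  if( umami_get_meta( UMAMI_META_DEVICE, &device ) != UMAMI_SUCCESS ) return 1;
  std::printf( "backend: %s\n", device == UMAMI_DEVICE_CPU ? "CPU" : "GPU" );
  UmamiHandle handle = nullptr; // opaque handle owned by the library
  if( umami_initialize( &handle, "param_card.dat" ) != UMAMI_SUCCESS ) return 1; // hypothetical path
  return 0;
}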
MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to twice the SIMD page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves global metadata of the matrix element implementation, such as the + * target device or the number of particles, diagrams and helicity combinations. + * + * @param meta_key + * key identifying the requested metadata entry + * @param result + * pointer to the memory receiving the result; its type depends on the key + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees a matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 065f7b4329..239e177d5b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,27 +49,27 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox".
Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t -INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models  ---2025-10-22 11:49:03-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz +INFO: download model from http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz to the following directory: /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/models  +--2025-12-11 12:38:40-- http://feynrules.irmp.ucl.ac.be/raw-attachment/wiki/SMEFT/SMEFTsim_topU3l_MwScheme_UFO.tar.gz Resolving feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)... 130.104.48.109 Connecting to feynrules.irmp.ucl.ac.be (feynrules.irmp.ucl.ac.be)|130.104.48.109|:80... connected. HTTP request sent, awaiting response... 200 Ok Length: 80562 (79K) [application/x-tar] Saving to: ‘tmp.tgz’ - 0K .......... .......... .......... .......... .......... 63% 830K 0s - 50K .......... .......... ........ 100% 124M=0.06s + 0K .......... .......... .......... .......... .......... 63% 880K 0s + 50K .......... .......... ........ 100% 1.37M=0.08s -2025-10-22 11:49:03 (1.27 MB/s) - ‘tmp.tgz’ saved [80562/80562] +2025-12-11 12:38:40 (1018 KB/s) - ‘tmp.tgz’ saved [80562/80562] SMEFTsim_topU3l_MwScheme_UFO/ SMEFTsim_topU3l_MwScheme_UFO/__init__.py @@ -89,7 +90,7 @@ SMEFTsim_topU3l_MwScheme_UFO/lorentz.py SMEFTsim_topU3l_MwScheme_UFO/vertices.py SMEFTsim_topU3l_MwScheme_UFO/restrict_SMlimit_massless.dat fail to load model but auto_convert_model is on True. 
Trying to convert the model -convert model /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO +convert model /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/models/SMEFTsim_topU3l_MwScheme_UFO retry the load of the model import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles @@ -107,7 +108,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.07803130149841309  +DEBUG: model prefixing takes 0.03392601013183594  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -125,33 +126,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 2.695 s +1 processes with 72 diagrams generated in 1.441 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for 
process: g g > t t~ t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
-Generated helas calls for 1 subprocesses (72 diagrams) in 0.127 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.068 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.281 s +ALOHA: aloha creates 5 routines in 0.148 s VVV5 VVV5 FFV1 @@ -161,17 +162,17 @@ ALOHA: aloha creates 5 routines in 0.281 s VVVV1 VVVV9 VVVV10 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m4.417s -user 0m3.862s -sys 0m0.114s -Code generation completed in 5 seconds +real 0m4.179s +user 0m2.251s +sys 0m0.181s +Code generation completed in 4 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const 
unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h index 24800c08c9..50496fa2bf 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" +#include "processConfig.h" + #include + @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc index 8c3316992a..d7b2ade5f4 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h"
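For reference, the MemoryBuffers.h hunk above grows the numerator buffer from one slot per event (sizePerEventNumerators = 1) to one slot per diagram per event (processConfig::ndiagrams). A minimal sketch of the resulting flat row-major indexing, mirroring the division performed in copy_outputs in umami.cc above; the helper name diagramWeight is illustrative and not part of the patch:

// Sketch only: per-diagram single-diagram-enhancement weight of one event,
// assuming the row-major layout numerators[i_event * ndiagrams + i_diag]
// used by copy_outputs in umami.cc above.
#include <cstddef>

inline double diagramWeight( const double* numerators,   // [count * ndiagrams]
                             const double* denominators, // [count]
                             std::size_t i_event,
                             std::size_t i_diag,
                             std::size_t ndiagrams )
{
  return numerators[i_event * ndiagrams + i_diag] / denominators[i_event];
}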
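For reference, a minimal host-side sketch of calling the UMAMI C API declared in umami.h above; the function name run_umami_example, the stride choice and the buffer sizes are illustrative assumptions, while the umami_* calls, keys and signatures come from the header:

// Sketch only: evaluate |M|^2 for a batch of events through the UMAMI C API.
// Momenta are assumed to be laid out as momenta[stride * ( npar * i_mom + i_part ) + i_event]
// (the layout consumed by transpose_momenta in umami.cc), here with stride == count.
#include "umami.h"
#include <cstddef>
#include <vector>

int run_umami_example( const char* param_card_path, std::size_t count, const double* momenta )
{
  UmamiHandle handle = nullptr;
  if( umami_initialize( &handle, param_card_path ) != UMAMI_SUCCESS ) return 1;

  std::vector<double> me2( count ); // one |M|^2 value per event
  UmamiInputKey input_keys[1] = { UMAMI_IN_MOMENTA };
  const void* inputs[1] = { momenta };
  UmamiOutputKey output_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
  void* outputs[1] = { me2.data() };

  UmamiStatus status = umami_matrix_element( handle, count, /*stride*/ count, /*offset*/ 0,
                                             1, input_keys, inputs, 1, output_keys, outputs );
  umami_free( handle );
  return status == UMAMI_SUCCESS ? 0 : 1;
}

Omitted optional inputs fall back to the defaults hardcoded in umami.cc (random numbers default to 0.5 and the strong coupling to a fixed value when UMAMI_IN_ALPHA_S is absent).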
@@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! 
#924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( 
denominators ); #endif @@ -1758,9 +1819,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -1836,8 +1896,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -1899,25 +1958,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + {
firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -1962,16 +2031,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -2039,6 +2136,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -2048,6 +2146,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -2059,8 +2159,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt*ndiagrams], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -2088,7 +2190,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -2104,7 +2205,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -2118,11 +2219,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -2134,6 +2238,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -2142,9
+2247,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -2155,9 +2261,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2195,40 +2307,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -2243,7 +2321,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -2291,82 +2370,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -2391,13 +2485,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -2405,13 +2493,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h index f8f13801dd..7e444f2546 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h new file mode 100644 index 0000000000..b9f07de180 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
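Context for the buffer resizing above: the per-diagram numerators introduced by this patch are stored event-major on the GPU and SIMD-page-major in the C++ path. Below is a minimal sketch, not part of the patch, of the two indexing conventions used throughout this changeset; the helper names are hypothetical, while fptype, neppV and processConfig::ndiagrams are as defined in the cudacpp headers. The GPU form matches select_col_and_diag and copy_outputs, the CPU form matches the allNumerators indexing in the rewritten C++ sigmaKin.

// Sketch only (hypothetical helpers): per-diagram numerator lookup for one event.
inline fptype& numeratorGpu( fptype* allNumerators, int ievt, int idiag )
{
  // GPU: one scalar slot per (event, diagram), event-major
  return allNumerators[ievt * processConfig::ndiagrams + idiag];
}
inline fptype& numeratorCpu( fptype* allNumerators, int ievt, int idiag, int neppV )
{
  const int ipag = ievt / neppV;  // SIMD page index
  const int ilane = ievt % neppV; // lane within the page
  // CPU: diagrams interleaved per SIMD page of neppV events
  return allNumerators[ipag * neppV * processConfig::ndiagrams + idiag * neppV + ilane];
}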
+ + +#ifndef MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H +#define MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 72; + +} + +#endif // MG5_CONFIG_SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_H \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.cc @@ -0,0 +1,534 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cmath> +#include <cstddef> + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* 
numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
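+ // Each entry of `inputs` is an untyped void*: the UmamiInputKey selects the
+ // concrete element type (double for the momenta, alpha_s and random-number
+ // inputs, int for the index inputs), which the cases below restore via static_cast.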
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // round the event count up to two SIMD pages: in mixed precision, the C++ sigmaKin processes two neppV pages at a time (see #924) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added. 
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata about the matrix element implementation, + * independently of any instance (e.g. the device it runs on). + * + * @param meta_key + * key identifying the metadata entry to query + * @param result + * pointer to caller-allocated memory that receives the value; its + * concrete type depends on the meta key (e.g. int for the counts). + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs. 
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 01968dc817..cab1a5820a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.071 s +1 processes with 6 diagrams generated in 0.054 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -575,52 +576,54 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (6 diagrams) in 0.007 s -Wrote files for 16 helas calls in 0.065 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (6 diagrams) in 0.005 s +Wrote files for 16 helas calls in 2.514 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.125 s +ALOHA: aloha creates 3 routines in 0.105 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines 
-ALOHA: aloha creates 6 routines in 0.118 s +ALOHA: aloha creates 6 routines in 0.089 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m2.714s -user 0m2.329s -sys 0m0.381s -Code generation completed in 3 seconds +real 0m11.941s +user 0m1.873s +sys 0m0.654s +Code generation completed in 12 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -641,9 +644,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -670,9 +673,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat index 9cfb7ac1a2..0c5a1bdd83 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/ident_card.dat @@ -232,9 +232,9 @@ mse2 3 3 mdl_RmE23x3 msl2 1 1 mdl_RmL21x1 msl2 3 3 mdl_RmL23x3 msoft 1 mdl_RMx1 +msoft 2 mdl_RMx2 msoft 21 mdl_mHd2 msoft 22 mdl_mHu2 -msoft 2 mdl_RMx2 msoft 3 mdl_RMx3 msq2 1 1 mdl_RmQ21x1 msq2 3 3 mdl_RmQ23x3 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc index 6acb037f00..a3d72e8ed8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/param_card.inc @@ -81,9 +81,9 @@ MDL_RML21X1 = 3.815567D+04 MDL_RML23X3 = 3.782868D+04 MDL_RMX1 = 1.013965D+02 + MDL_RMX2 = 1.915042D+02 MDL_MHD2 = 3.233749D+04 MDL_MHU2 = -1.288001D+05 - MDL_RMX2 = 1.915042D+02 MDL_RMX3 = 5.882630D+02 MDL_RMQ21X1 = 2.998367D+05 MDL_RMQ23X3 = 2.487654D+05 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( 
hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h index c5e79dc1b1..76849a871e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_MSSM_SLHA2.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc index 89c03a7876..0b6ddbf4aa 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const 
unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -400,8 +461,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 2 VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv 
+= cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; @@ -414,8 +478,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; @@ -427,8 +494,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 4 VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[3] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += amp_sv[0]; @@ -440,8 +510,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 5 VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[4] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; @@ -453,8 +526,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 6 VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[5] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] += amp_sv[0]; @@ -755,9 +831,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -833,8 +908,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? 
#else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -896,25 +970,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom ) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -959,16 +1043,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all 
denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1036,6 +1148,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1045,6 +1158,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1056,8 +1171,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1085,7 +1202,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: 
buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1101,7 +1217,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1115,11 +1231,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1131,6 +1250,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1139,9 +1259,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1152,9 +1273,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1192,40 +1319,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1240,7 +1333,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1288,82 +1382,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1388,13 +1497,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1402,13 +1505,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h index 0c297072b2..bee85ff0b7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ int* allselcol, // output: color selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply the ME output by the channel weight #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h new file mode 100644 index 0000000000..998cb0ade6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/processConfig.h 
@@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 6; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, 
+ fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + { 
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to twice the SIMD page size for some reason + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
+    }
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
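+   * For example, a caller built against minor version 0 can still be used with a
+   * library implementing minor version 2 of the same major version.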
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves global metadata about the matrix element code, such as the device
+   * it runs on or the number of particles, diagrams, helicity and color
+   * configurations.
+   *
+   * @param meta_key
+   *    key identifying the requested metadata
+   * @param result
+   *    pointer to caller-allocated memory that receives the value; its type
+   *    depends on the key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param param_card_path
+   *    path to the parameter file
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
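+   *
+   * Illustrative usage sketch (not normative: `momenta` and `m2` stand for
+   * caller-allocated arrays of count*npar*4 and count doubles respectively,
+   * laid out contiguously with stride == count and offset == 0; error
+   * handling is omitted):
+   *
+   *   UmamiInputKey in_keys[1] = { UMAMI_IN_MOMENTA };
+   *   const void* const inputs[1] = { momenta };
+   *   UmamiOutputKey out_keys[1] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* const outputs[1] = { m2 };
+   *   umami_matrix_element( handle, count, count, 0,
+   *                         1, in_keys, inputs, 1, out_keys, outputs );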
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs to the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt
index 0c5c2efcaf..ca077e5af2 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt
@@ -1,3 +1,4 @@
+WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment.
 Note that this is a development version.
 This version is intended for development/beta testing and NOT for production.
 This version has not been fully tested (if at all) and might have limited user support (if at all)
@@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg
+import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -549,47 +550,47 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.074 s +1 processes with 6 diagrams generated in 0.055 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
-Generated helas calls for 1 subprocesses (6 diagrams) in 0.006 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. +Generated helas calls for 1 subprocesses (6 diagrams) in 0.010 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.126 s +ALOHA: aloha creates 3 routines in 0.102 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.007s -user 0m0.940s -sys 0m0.062s -Code generation completed in 1 seconds +real 0m2.643s +user 0m0.729s +sys 0m0.132s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h
index c5e79dc1b1..76849a871e 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_MSSM_SLHA2.h"
+#include "processConfig.h"
 
 #include
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc
index e4718e0681..3db12d56be 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
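+  // getChannelId: helper that extracts the SCALAR channelId for the current event (CUDA)
+  // or for the current SIMD event page(s) (C++); it factors out the extraction and sanity
+  // checks previously inlined in sigmaKin (see the corresponding removed block further down),
+  // so that the same logic can also be reused in the final ME normalisation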
+ __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds +#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -756,9 +817,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; 
// disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -834,8 +894,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -897,25 +956,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -960,16 +1029,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1037,6 +1134,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1046,6 +1144,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1057,8 +1157,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1086,7 +1188,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1102,7 +1203,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1116,11 +1217,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1132,6 +1236,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1140,9 +1245,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = 
ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1153,9 +1259,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1193,40 +1305,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1241,7 +1319,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1289,82 +1368,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1389,13 +1483,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1403,13 +1491,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h index 0c297072b2..bee85ff0b7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h @@ -163,6 +163,7 @@ namespace 
mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h new file mode 100644 index 0000000000..998cb0ade6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
+ + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 6; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_T1T1X_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include <cstddef> // for std::size_t + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators,
+#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<std::size_t*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<std::size_t*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<std::size_t*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast<const int*>( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast<const double*>( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast<const int*>( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast<double*>( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast<int*>( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast<int*>( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast<gpuStream_t>( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ),
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + random_helicity_in, + random_color_in, + random_diagram_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to double the SIMD page size (reason unclear) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ?
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> // for size_t + +#ifdef __cplusplus extern "C" { #endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Retrieves a piece of global metadata about this matrix element code, such as the + * device it runs on or its particle, diagram and helicity counts. + * + * @param meta_key + * key selecting the piece of metadata to query + * @param result + * pointer to caller-allocated storage receiving the value; its type depends on + * the key (UmamiDevice* for UMAMI_META_DEVICE, size_t* for the counts) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param param_card_path + * path to the parameter file + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real-valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees a matrix element instance + * + * @param handle + * handle of a matrix element instance + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus } #endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 463187a10a..3ece1f2ceb 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -549,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step.
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.089 s +1 processes with 3 diagrams generated in 0.077 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4168]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -575,49 +576,51 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1552]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1576]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1577]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.076 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1564]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1588]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1589]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 1.977 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.123 s +ALOHA: aloha creates 2 routines in 0.081 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.120 s +ALOHA: aloha creates 4 routines in 0.072 s VVV1 FFV1 FFV1 FFV1 -FileWriter for 
/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: result.returncode =  0 [output.py at line 273]  -Output to directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +DEBUG: result.returncode =  0 [output.py at line 275]  +Output to directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. Type "launch" to generate events from this process, or see -/home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m3.218s -user 0m2.778s -sys 0m0.430s -Code generation completed in 3 seconds +real 0m11.901s +user 0m1.762s +sys 0m0.693s +Code generation completed in 12 seconds +/shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/bin/internal/banner.py:3356: SyntaxWarning: invalid escape sequence '\s' + function_pat = re.compile('^\s+(?:SUBROUTINE|(?:%(type)s)\s+function)\s+([a-zA-Z]\w*)' \ ************************************************************ * * * W E L C O M E to * @@ -638,9 +641,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run @@ -667,9 +670,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat index 9cfb7ac1a2..0c5a1bdd83 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/ident_card.dat @@ -232,9 +232,9 @@ mse2 3 3 mdl_RmE23x3 msl2 1 1 mdl_RmL21x1 msl2 3 3 mdl_RmL23x3 msoft 1 mdl_RMx1 +msoft 2 mdl_RMx2 msoft 21 mdl_mHd2 msoft 22 mdl_mHu2 -msoft 2 mdl_RMx2 msoft 3 mdl_RMx3 msq2 1 1 mdl_RmQ21x1 msq2 3 3 mdl_RmQ23x3 diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 97e103a317..a0212bfb62 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/mg5amcnlo +#mg5_path = /shared/roiser/sw/madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts index f10336e42e..74463b32eb 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts @@ -5,8 +5,8 @@ GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime PYTHIA8_PATH=NotInstalled -STDLIB_FLAG= STDLIB=-lstdc++ +STDLIB_FLAG= #end_of_make_opts_variables BIASLIBDIR=../../../lib/ diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc index 6acb037f00..a3d72e8ed8 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/param_card.inc @@ -81,9 +81,9 @@ MDL_RML21X1 = 3.815567D+04 MDL_RML23X3 = 3.782868D+04 MDL_RMX1 = 1.013965D+02 + MDL_RMX2 = 1.915042D+02 MDL_MHD2 = 3.233749D+04 MDL_MHU2 = -1.288001D+05 - MDL_RMX2 = 1.915042D+02 MDL_RMX3 = 5.882630D+02 MDL_RMQ21X1 = 2.998367D+05 MDL_RMQ23X3 = 2.487654D+05 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define 
gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h index 63c17a68fa..50a6aaef4d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -128,6 +128,14 @@ namespace mg5amcCpu #endif } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccessP( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv* + kernelAccessP( fptype* buffer ) + { + return reinterpret_cast<fptype_sv*>( buffer ); + } + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] static constexpr auto kernelAccessConst_s = diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h index c5e79dc1b1..76849a871e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -13,6 +13,7 @@ #include "CPPProcess.h" #include "GpuRuntime.h" #include "Parameters_MSSM_SLHA2.h" +#include "processConfig.h" #include @@ -295,7 +296,8 @@ namespace mg5amcCpu typedef BufferBase<fptype> BufferNumerators; // The size (number of elements) per event in a memory buffer for numerators - constexpr size_t sizePerEventNumerators = 1; + // (should be equal to the number of diagrams in the process) + constexpr size_t sizePerEventNumerators = processConfig::ndiagrams; #ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for numerators diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 98722d3089..14482e097b 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -27,6 +27,7 @@ #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" #include "color_sum.h" +#include "processConfig.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -98,6 +99,69 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __device__ INLINE unsigned int getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL + , + const int ievt00, + bool sanityCheckMixedPrecision = true +#endif + ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPUCPP_GPUIMPL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#else // Cuda or C++ + using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) + // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + if( allChannelIds != nullptr ) + { + // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) + const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 + uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) +#ifndef MGONGPU_CPPSIMD + // NB: channelIds_sv is a scalar in no-SIMD C++ + channelId = channelIds_sv; +#else + // NB: channelIds_sv is a vector in SIMD C++ + channelId = channelIds_sv[0]; // element[0] + for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] + { + assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId + } +#endif + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + if( sanityCheckMixedPrecision ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) + const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 + uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) + // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + for( int i = 0; i < neppV; ++i ) + { + assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector + } +#endif + } + } +#endif // MGONGPUCPP_GPUIMPL + return channelId; + } +#endif // MGONGPU_SUPPORTS_MULTICHANNEL + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -391,8 +452,11 @@ // Amplitude(s) for diagram number 1 FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[0] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; @@ -405,8 +469,11 @@
namespace mg5amcCpu // Amplitude(s) for diagram number 2 FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[1] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[0] -= amp_sv[0]; @@ -418,8 +485,11 @@ namespace mg5amcCpu // Amplitude(s) for diagram number 3 FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + if( storeChannelWeights ) + { + numerators_sv[2] += cxabs2( amp_sv[0] ); + denominators_sv += cxabs2( amp_sv[0] ); + } #endif jamp_sv[1] -= amp_sv[0]; @@ -732,9 +802,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; // disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -810,8 +879,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry?
#endif /* clang-format on */ @@ -873,25 +941,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -936,16 +1014,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == 
gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1013,6 +1119,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1022,6 +1129,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1033,8 +1142,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1062,7 +1173,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // 
non-trivial access: buffer includes all events #endif #endif @@ -1078,7 +1188,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1092,11 +1202,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1108,6 +1221,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1116,9 +1230,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1129,9 +1244,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, 
gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1169,40 +1290,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1217,7 +1304,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1265,82 +1353,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL 
ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1365,13 +1468,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1379,13 +1476,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, 
ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 256c5780e4..99f978df4c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h new file mode 100644 index 0000000000..04a79dca0d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// 
Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. + + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool 
is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s, corresponding to alpha_s = 0.118 + } + + __global__ void copy_outputs( + fptype* denominators, + fptype* numerators, + fptype* matrix_elements, + unsigned int* diagram_index, + int* color_index, + int* helicity_index, + double* m2_out, + double* amp2_out, + int* diagram_out, + int* color_out, + int* helicity_out, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event]; + if( amp2_out ) + { + double denominator = denominators[i_event]; + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator; + } + } + if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1; + if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1; + if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + +#endif // MGONGPUCPP_GPUIMPL + + struct InterfaceInstance + { + bool initialized = false; +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t hel_streams[CPPProcess::ncomb]; +#endif + }; + +} + +extern "C" +{ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ) + { + switch( meta_key ) + { + case UMAMI_META_DEVICE: + { + UmamiDevice& device = *static_cast<UmamiDevice*>( result ); +#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ + device = UMAMI_DEVICE_CUDA; +#elif defined( __HIPCC__ ) + device = UMAMI_DEVICE_HIP; +#endif +#else + device = UMAMI_DEVICE_CPU; +#endif + break; + } + case UMAMI_META_PARTICLE_COUNT: + *static_cast<int*>( result ) = CPPProcess::npar; + break; + case UMAMI_META_DIAGRAM_COUNT: + *static_cast<int*>( result ) = CPPProcess::ndiagrams; + break; + case UMAMI_META_HELICITY_COUNT: + *static_cast<int*>( result ) = CPPProcess::ncomb; + break; + case UMAMI_META_COLOR_COUNT: + return UMAMI_ERROR_UNSUPPORTED_META; + default: + return UMAMI_ERROR_UNSUPPORTED_META; + } + return UMAMI_SUCCESS; + } + + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ) + { + CPPProcess process; + process.initProc( param_card_path ); + auto instance = new InterfaceInstance(); + *handle = instance; +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + gpuStreamCreate( &instance->hel_streams[ihel] ); + } +#endif + return UMAMI_SUCCESS; + } + + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ) + { + return UMAMI_ERROR_NOT_IMPLEMENTED; + } + + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ) + { + const double* momenta_in = nullptr; + const double* alpha_s_in = nullptr; + const int* flavor_in = nullptr; // TODO: unused + const double* random_color_in = nullptr; + const double* random_helicity_in = nullptr; + const double* random_diagram_in = nullptr; + const int* diagram_in = nullptr; // TODO: unused + + for( std::size_t i = 0; i < input_count; ++i ) + { + const void* input = inputs[i]; + switch( input_keys[i] ) + {
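+        // Input layouts assumed by the handlers below (and by copy_inputs/transpose_momenta):
+        //  - UMAMI_IN_MOMENTA: const double*, momenta_in[stride * ( npar * i_mom + i_part ) + i_event]
+        //  - UMAMI_IN_ALPHA_S: const double*, one alpha_s per event (g_s = sqrt( 4 * M_PI * alpha_s ))
+        //  - UMAMI_IN_RANDOM_COLOR / _HELICITY / _DIAGRAM: const double*, one uniform random number per event
+        //  - UMAMI_IN_FLAVOR_INDEX / UMAMI_IN_DIAGRAM_INDEX: const int*, accepted but currently unused (see TODOs above)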
+ case UMAMI_IN_MOMENTA: + momenta_in = static_cast( input ); + break; + case UMAMI_IN_ALPHA_S: + alpha_s_in = static_cast( input ); + break; + case UMAMI_IN_FLAVOR_INDEX: + flavor_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_COLOR: + random_color_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_HELICITY: + random_helicity_in = static_cast( input ); + break; + case UMAMI_IN_RANDOM_DIAGRAM: + random_diagram_in = static_cast( input ); + break; + case UMAMI_IN_HELICITY_INDEX: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + case UMAMI_IN_DIAGRAM_INDEX: + diagram_in = static_cast( input ); + break; + default: + return UMAMI_ERROR_UNSUPPORTED_INPUT; + } + } + if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT; + +#ifdef MGONGPUCPP_GPUIMPL + gpuStream_t gpu_stream = nullptr; +#endif + double* m2_out = nullptr; + double* amp2_out = nullptr; + int* diagram_out = nullptr; + int* color_out = nullptr; + int* helicity_out = nullptr; + for( std::size_t i = 0; i < output_count; ++i ) + { + void* output = outputs[i]; + switch( output_keys[i] ) + { + case UMAMI_OUT_MATRIX_ELEMENT: + m2_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_AMP2: + amp2_out = static_cast( output ); + break; + case UMAMI_OUT_COLOR_INDEX: + color_out = static_cast( output ); + break; + case UMAMI_OUT_HELICITY_INDEX: + helicity_out = static_cast( output ); + break; + case UMAMI_OUT_DIAGRAM_INDEX: + diagram_out = static_cast( output ); + break; +#ifdef MGONGPUCPP_GPUIMPL + case UMAMI_OUT_GPU_STREAM: + gpu_stream = static_cast( output ); + break; +#endif + default: + return UMAMI_ERROR_UNSUPPORTED_OUTPUT; + } + } + +#ifdef MGONGPUCPP_GPUIMPL + std::size_t n_threads = 256; + std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads; + std::size_t rounded_count = n_blocks * n_threads; + + fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps; + fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps; + int *helicity_index, *color_index; + unsigned int* diagram_index; + + std::size_t n_coup = mg5amcGpu::Parameters_sm_dependentCouplings::ndcoup; + gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream ); + gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream ); + gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream ); + gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), 
gpu_stream ); + + copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + momenta_in, + helicity_random_in, + color_random_in, + diagram_random_in, + alpha_s_in, + momenta, + helicity_random, + color_random, + diagram_random, + g_s, + count, + stride, + offset ); + computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings ); + checkGpu( gpuPeekAtLastError() ); + // TODO: make things fully async (requires using events instead of synchronize in + // the sigmaKin implementation) + gpuStreamSynchronize( gpu_stream ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta, + couplings, + helicity_random, + color_random, + nullptr, + diagram_random, + matrix_elements, + helicity_index, + color_index, + color_jamps, + numerators, + denominators, + diagram_index, + false, + ghel_matrix_elements, + ghel_jamps, + nullptr, + nullptr, + instance->hel_streams, + n_blocks, + n_threads ); + + copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>( + denominators, + numerators, + matrix_elements, + diagram_index, + color_index, + helicity_index, + m2_out, + amp2_out, + diagram_out, + color_out, + helicity_out, + count, + stride, + offset ); + checkGpu( gpuPeekAtLastError() ); + + gpuFreeAsync( momenta, gpu_stream ); + gpuFreeAsync( couplings, gpu_stream ); + gpuFreeAsync( g_s, gpu_stream ); + gpuFreeAsync( helicity_random, gpu_stream ); + gpuFreeAsync( color_random, gpu_stream ); + gpuFreeAsync( diagram_random, gpu_stream ); + gpuFreeAsync( matrix_elements, gpu_stream ); + gpuFreeAsync( diagram_index, gpu_stream ); + gpuFreeAsync( color_jamps, gpu_stream ); + gpuFreeAsync( numerators, gpu_stream ); + gpuFreeAsync( denominators, gpu_stream ); + gpuFreeAsync( helicity_index, gpu_stream ); + gpuFreeAsync( color_index, gpu_stream ); + gpuFreeAsync( ghel_matrix_elements, gpu_stream ); + gpuFreeAsync( ghel_jamps, gpu_stream ); +#else // MGONGPUCPP_GPUIMPL + // need to round up to twice the SIMD page size (in mixed precision, calculate_jamps processes two neppV pages at once, see #924) + std::size_t page_size2 = 2 * MemoryAccessMomentaBase::neppM; + std::size_t rounded_count = ( count + page_size2 - 1 ) / page_size2 * page_size2; + + HostBufferBase<fptype> momenta( rounded_count * CPPProcess::npar * 4 ); + HostBufferBase<fptype> couplings( rounded_count * mg5amcCpu::Parameters_sm_dependentCouplings::ndcoup * 2 ); + HostBufferBase<fptype> g_s( rounded_count ); + HostBufferBase<fptype> helicity_random( rounded_count ); + HostBufferBase<fptype> color_random( rounded_count ); + HostBufferBase<fptype> diagram_random( rounded_count ); + HostBufferBase<fptype> matrix_elements( rounded_count ); + HostBufferBase<unsigned int> diagram_index( rounded_count ); + HostBufferBase<fptype> numerators( rounded_count * CPPProcess::ndiagrams ); + HostBufferBase<fptype> denominators( rounded_count ); + HostBufferBase<int> helicity_index( rounded_count ); + HostBufferBase<int> color_index( rounded_count ); + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + transpose_momenta( &momenta_in[offset], momenta.data(), i_event, stride ); + helicity_random[i_event] = random_helicity_in ? random_helicity_in[i_event + offset] : 0.5; + color_random[i_event] = random_color_in ? random_color_in[i_event + offset] : 0.5; + diagram_random[i_event] = random_diagram_in ? random_diagram_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195; // default g_s, corresponding to alpha_s = 0.118 + } + computeDependentCouplings( g_s.data(), couplings.data(), rounded_count ); + + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); + if( !instance->initialized ) + { + initialize( + momenta.data(), + couplings.data(), + matrix_elements.data(), + numerators.data(), + denominators.data(), + rounded_count ); + instance->initialized = true; + } + + sigmaKin( + momenta.data(), + couplings.data(), + helicity_random.data(), + color_random.data(), + nullptr, + diagram_random.data(), + matrix_elements.data(), + helicity_index.data(), + color_index.data(), + numerators.data(), + denominators.data(), + diagram_index.data(), + false, + rounded_count ); + + std::size_t page_size = MemoryAccessMomentaBase::neppM; + for( std::size_t i_event = 0; i_event < count; ++i_event ) + { + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + double denominator = denominators[i_event]; + if( m2_out != nullptr ) + { + m2_out[i_event + offset] = matrix_elements[i_event]; + } + if( amp2_out != nullptr ) + { + for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag ) + { + amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator; + } + } + if( diagram_out != nullptr ) + { + diagram_out[i_event + offset] = diagram_index[i_event] - 1; + } + if( color_out != nullptr ) + { + color_out[i_event + offset] = color_index[i_event] - 1; + } + if( helicity_out != nullptr ) + { + helicity_out[i_event + offset] = helicity_index[i_event] - 1; + } + } +#endif // MGONGPUCPP_GPUIMPL + return UMAMI_SUCCESS; + } + + UmamiStatus umami_free( UmamiHandle handle ) + { + InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle ); +#ifdef MGONGPUCPP_GPUIMPL + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { + if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] ); + } +#endif + delete instance; + return UMAMI_SUCCESS; + } +} diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h new file mode 100644 index 0000000000..39ac6fe385 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/umami.h @@ -0,0 +1,212 @@ +/* + * _ + * (_) + * _ _ _ __ ___ __ _ _ __ ___ _ + * | | | | '_ ` _ \ / _` | '_ ` _ \| | + * | |_| | | | | | | (_| | | | | | | | + * \__,_|_| |_| |_|\__,_|_| |_| |_|_| + * + * Unified MAtrix eleMent Interface + * + * + */ + +#ifndef UMAMI_HEADER +#define UMAMI_HEADER 1 + +#include <stddef.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + + /** + * Major version number of the UMAMI interface. If the major version is the same + * between caller and implementation, binary compatibility is ensured. + */ + const inline int UMAMI_MAJOR_VERSION = 1; + + /** + * Minor version number of the UMAMI interface. Between minor versions, new keys for + * errors, devices, metadata, inputs and outputs can be added.
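+   * Callers can thus check compatibility at load time: an equal major version
+   * guarantees binary compatibility, while a larger minor version only adds keys.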
+ */ + const inline int UMAMI_MINOR_VERSION = 0; + + typedef enum + { + UMAMI_SUCCESS, + UMAMI_ERROR, + UMAMI_ERROR_NOT_IMPLEMENTED, + UMAMI_ERROR_UNSUPPORTED_INPUT, + UMAMI_ERROR_UNSUPPORTED_OUTPUT, + UMAMI_ERROR_UNSUPPORTED_META, + UMAMI_ERROR_MISSING_INPUT, + } UmamiStatus; + + typedef enum + { + UMAMI_DEVICE_CPU, + UMAMI_DEVICE_CUDA, + UMAMI_DEVICE_HIP, + } UmamiDevice; + + typedef enum + { + UMAMI_META_DEVICE, + UMAMI_META_PARTICLE_COUNT, + UMAMI_META_DIAGRAM_COUNT, + UMAMI_META_HELICITY_COUNT, + UMAMI_META_COLOR_COUNT, + } UmamiMetaKey; + + typedef enum + { + UMAMI_IN_MOMENTA, + UMAMI_IN_ALPHA_S, + UMAMI_IN_FLAVOR_INDEX, + UMAMI_IN_RANDOM_COLOR, + UMAMI_IN_RANDOM_HELICITY, + UMAMI_IN_RANDOM_DIAGRAM, + UMAMI_IN_HELICITY_INDEX, + UMAMI_IN_DIAGRAM_INDEX, + } UmamiInputKey; + + typedef enum + { + UMAMI_OUT_MATRIX_ELEMENT, + UMAMI_OUT_DIAGRAM_AMP2, + UMAMI_OUT_COLOR_INDEX, + UMAMI_OUT_HELICITY_INDEX, + UMAMI_OUT_DIAGRAM_INDEX, + UMAMI_OUT_GPU_STREAM, + // NLO: born, virtual, poles, counterterms + // color: LC-ME, FC-ME + } UmamiOutputKey; + + typedef void* UmamiHandle; + + /** + * Queries global metadata about the matrix element implementation, such as the + * device it runs on or the number of particles, diagrams and helicities. + * + * @param meta_key + * key identifying the requested metadata + * @param result + * pointer to caller-allocated memory receiving the value; its type depends on + * the key (UmamiDevice for UMAMI_META_DEVICE, int for the counts) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result ); + + /** + * Creates an instance of the matrix element. Each instance is independent, so thread + * safety can be achieved by creating a separate one for every thread. + * + * @param handle + * pointer to an instance of the subprocess. Has to be cleaned up by + * the caller with `umami_free`. + * @param param_card_path + * path to the parameter file + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path ); + + /** + * Sets the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * real part of the parameter value + * @param parameter_imag + * imaginary part of the parameter value. Ignored for real valued parameters. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_set_parameter( + UmamiHandle handle, + char const* name, + double parameter_real, + double parameter_imag ); + + /** + * Retrieves the value of a model parameter + * + * @param handle + * handle of a matrix element instance + * @param name + * name of the parameter + * @param parameter_real + * pointer to double to return real part of the parameter value + * @param parameter_imag + * pointer to double to return imaginary part of the parameter value. Ignored + * for real-valued parameters (i.e. you may pass a null pointer) + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_get_parameter( + UmamiHandle handle, + char const* name, + double* parameter_real, + double* parameter_imag ); + + /** + * Evaluates the matrix element as a function of the given inputs, filling the + * requested outputs.
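+   *
+   * A minimal single-event call requesting only |M|^2 might look as follows
+   * (illustrative sketch; assumes a handle from umami_initialize and npar = 4):
+   *
+   *   double momenta[4 * 4]; // count=1, stride=1: all energies, then all px, py, pz
+   *   double m2;
+   *   UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA };
+   *   void const* ins[] = { momenta };
+   *   UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
+   *   void* outs[] = { &m2 };
+   *   umami_matrix_element( handle, 1, 1, 0, 1, in_keys, ins, 1, out_keys, outs );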
+ * + * @param handle + * handle of a matrix element instance + * @param count + * number of events to evaluate the matrix element for + * @param stride + * stride of the batch dimension of the input and output arrays, see memory layout + * @param offset + * offset of the event index + * @param input_count + * number of inputs to the matrix element + * @param input_keys + * pointer to an array of input keys, length `input_count` + * @param inputs + * pointer to an array of void pointers to the inputs. The type of the inputs + * depends on the input key + * @param output_count + * number of outputs to the matrix element + * @param output_keys + * pointer to an array of output keys, length `output_count` + * @param outputs + * pointer to an array of void pointers to the outputs. The type of the outputs + * depends on the output key. The caller is responsible for allocating memory for + * the outputs. + * @return + * UMAMI_SUCCESS on success, error code otherwise + */ + UmamiStatus umami_matrix_element( + UmamiHandle handle, + size_t count, + size_t stride, + size_t offset, + size_t input_count, + UmamiInputKey const* input_keys, + void const* const* inputs, + size_t output_count, + UmamiOutputKey const* output_keys, + void* const* outputs ); + + /** + * Frees matrix element instance + * + * @param handle + * handle of a matrix element instance + */ + UmamiStatus umami_free( UmamiHandle handle ); + +#ifdef __cplusplus +} +#endif + +#endif // UMAMI_HEADER diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 9c4080b86d..88fc5f557b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -1,3 +1,4 @@ +WARNING:root:python3.12 support is still experimental. For the moment re-weighting is NOT working and do expect a LOT of syntax warning. We do not advise python3.12 for production for the moment. Note that this is a development version. This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) @@ -48,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -56,7 +57,7 @@ set zerowidth_tchannel F import model MSSM_SLHA2 INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.6192381381988525  +DEBUG: model prefixing takes 0.23334097862243652  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -552,45 +553,45 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.063 s +1 processes with 3 diagrams generated in 0.074 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  -INFO: Creating subdirectories in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 177]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 182]  +INFO: Creating subdirectories in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  -DEBUG: type(subproc_group)= [output.py at line 223]  -DEBUG: type(fortran_model)= [output.py at line 224]  -DEBUG: type(me)= me=0 [output.py at line 225]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  -INFO: Creating files in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
-Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 224]  +DEBUG: type(subproc_group)= [output.py at line 225]  +DEBUG: type(fortran_model)= [output.py at line 226]  +DEBUG: type(me)= me=0 [output.py at line 227]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.h', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'cudacpp_overlay.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'umami.h', 'umami.cc', 'perf.py', 'profile.sh'] [output.py at line 228]  +INFO: Creating files in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. +Generated helas calls for 1 subprocesses (3 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.095 s +ALOHA: aloha creates 2 routines in 0.068 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /home/dmass/Development/madgraph4gpu/release-v1.01.01/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /shared/roiser/sw/madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.922s -user 0m1.810s -sys 0m0.099s -Code generation completed in 2 seconds +real 0m3.608s +user 0m1.150s +sys 0m0.169s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h index 8a37d1f947..026253f354 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -49,6 +49,9 @@ #define gpuStream_t cudaStream_t #define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( cudaMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( cudaFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( cudaStreamSynchronize( stream ) ) #define gpuBlasStatus_t cublasStatus_t #define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS @@ -113,6 +116,9 @@ #define gpuStream_t hipStream_t #define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) #define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) +#define gpuMallocAsync( ptr, size, stream ) checkGpu( hipMallocAsync( ptr, size, stream ) ) +#define gpuFreeAsync( ptr, stream ) checkGpu( hipFreeAsync( ptr, stream ) ) +#define gpuStreamSynchronize( stream ) checkGpu( hipStreamSynchronize( stream ) ) #define gpuBlasStatus_t hipblasStatus_t #define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 5ede45b123..469edd8d9e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -162,7 +162,7 @@ namespace mg5amcCpu , NumberOfEvents( nevt ) , m_couplings( nevt ) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) + , m_numerators( nevt * CPPProcess::ndiagrams ) , m_denominators( nevt ) #endif { @@ -220,7 +220,7 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nullptr, true, nevt() ); #else assert( useChannelIds == false ); sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); @@ -356,7 +356,7 @@ namespace mg5amcGpu m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering - m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() * CPPProcess::ndiagrams ) ); m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); #endif // Decide at runtime whether to use BLAS for color sums @@ -476,7 +476,7 @@ namespace mg5amcGpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) - m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ndiagrams * nevt ) ); m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); #endif #ifndef MGONGPU_HAS_NO_BLAS @@ -507,7 +507,7 @@ namespace mg5amcGpu #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr );
-    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, nullptr, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), nullptr, true, m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
     sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), ghelAllBlasTmp, pBlasHandle, m_helStreams, m_gpublocks, m_gputhreads );
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h
index 63c17a68fa..50a6aaef4d 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h
@@ -128,6 +128,14 @@ namespace mg5amcCpu
 #endif
   }
 
+  // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal)
+  // [Signature (SCALAR OR VECTOR) ===> fptype_sv* kernelAccess( fptype* buffer ) <===]
+  static __host__ __device__ inline fptype_sv*
+  kernelAccessP( fptype* buffer )
+  {
+    return reinterpret_cast<fptype_sv*>( buffer );
+  }
+
   // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
   // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===]
   static constexpr auto kernelAccessConst_s =
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h
index c5e79dc1b1..76849a871e 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h
@@ -13,6 +13,7 @@
 #include "CPPProcess.h"
 #include "GpuRuntime.h"
 #include "Parameters_MSSM_SLHA2.h"
+#include "processConfig.h"
 
 #include <sstream>
 
@@ -295,7 +296,8 @@ namespace mg5amcCpu
   typedef BufferBase<fptype> BufferNumerators;
 
   // The size (number of elements) per event in a memory buffer for numerators
-  constexpr size_t sizePerEventNumerators = 1;
+  // (should be equal to the number of diagrams in the process)
+  constexpr size_t sizePerEventNumerators = processConfig::ndiagrams;
 
 #ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
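With sizePerEventNumerators raised from 1 to processConfig::ndiagrams, the numerator buffers now hold one entry per diagram per event. The two backends lay this out differently: the CUDA/HIP kernels take an event-major block via &allNumerators[ievt * processConfig::ndiagrams], while the SIMD C++ path interleaves the diagrams within each neppV event page. A minimal sketch of the C++ indexing follows, with illustrative stand-in values for the generated constants neppV and ndiagrams:

// Sketch only: recomputes the SIMD-page index used for allNumerators in this patch;
// neppV and ndiagrams stand in for the generated compile-time constants.
#include <cstddef>

constexpr std::size_t neppV = 4;     // events per SIMD page (illustrative)
constexpr std::size_t ndiagrams = 3; // diagrams, as in processConfig::ndiagrams

constexpr std::size_t numeratorIndex( std::size_t ievt, std::size_t idiag )
{
  const std::size_t ipage = ievt / neppV; // which SIMD event page
  const std::size_t ieppV = ievt % neppV; // lane within the page
  return ipage * neppV * ndiagrams + idiag * neppV + ieppV;
}

static_assert( numeratorIndex( 0, 0 ) == 0 );
static_assert( numeratorIndex( 5, 2 ) == 21 ); // page 1, diagram 2, lane 1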
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc
index b88ebd5b4a..63f1df1073 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc
@@ -27,6 +27,7 @@
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
 #include "color_sum.h"
+#include "processConfig.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -98,6 +99,69 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __device__ INLINE unsigned int
+  getChannelId( const unsigned int* allChannelIds
+#ifndef MGONGPUCPP_GPUIMPL
+                ,
+                const int ievt00,
+                bool sanityCheckMixedPrecision = true
+#endif
+  )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
+#ifdef MGONGPUCPP_GPUIMPL
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
+#else // Cuda or C++
+    using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s)
+    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page
+    // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+    if( allChannelIds != nullptr )
+    {
+      // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV)
+      const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911
+      uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+#ifndef MGONGPU_CPPSIMD
+      // NB: channelIds_sv is a scalar in no-SIMD C++
+      channelId = channelIds_sv;
+#else
+      // NB: channelIds_sv is a vector in SIMD C++
+      channelId = channelIds_sv[0];    // element[0]
+      for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1]
+      {
+        assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId
+      }
+#endif
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+      if( sanityCheckMixedPrecision )
+      {
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV)
+        const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911
+        uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams)
+        // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924
+        for( int i = 0; i < neppV; ++i )
+        {
+          assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector
+        }
+#endif
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return channelId;
+  }
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
   constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g.
6 for e+ e- -> mu+ mu- (fermions and vectors) constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) @@ -239,7 +303,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL fptype* allJamps, // output: jamp[2*ncolor*nevt] buffer for one helicity _within a super-buffer for dcNGoodHel helicities_ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype* colAllJamp2s, // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) @@ -248,7 +312,7 @@ namespace mg5amcCpu #else cxtype_sv* allJamp_sv, // output: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for this helicity #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector + bool storeChannelWeights, fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel fptype_sv* jamp2_sv, // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) @@ -344,7 +408,8 @@ namespace mg5amcCpu const fptype* COUPs[nxcoup]; for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* numerators = &allNumerators[ievt * processConfig::ndiagrams]; fptype* denominators = allDenominators; #endif #else @@ -357,7 +422,7 @@ namespace mg5amcCpu for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #endif @@ -366,12 +431,8 @@ namespace mg5amcCpu for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#ifdef MGONGPUCPP_GPUIMPL - // SCALAR channelId for the current event (CUDA) - unsigned int channelId = gpu_channelId( allChannelIds ); -#endif // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); #endif @@ -729,9 +790,8 @@ namespace mg5amcCpu gpuMemset( allMEs, 0, maxtry * sizeof( fptype ) ); // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering - constexpr unsigned int* allChannelIds = nullptr; 
// disable multichannel single-diagram enhancement - gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, allChannelIds, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); + constexpr fptype_sv* allJamp2s = nullptr; // no need for color selection during helicity filtering + gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, false, allNumerators, allDenominators, allJamp2s, gpublocks * gputhreads ); #else gpuLaunchKernel( calculate_jamps, gpublocks, gputhreads, ihel, allmomenta, allcouplings, allJamps, gpublocks * gputhreads ); #endif @@ -807,8 +867,7 @@ namespace mg5amcCpu cxtype_sv jamp_sv[ncolor] = {}; // all zeros #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */ - constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, false, allNumerators, allDenominators, jamp2_sv, ievt00 ); //maxtry? #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); //maxtry? #endif /* clang-format on */ @@ -870,25 +929,35 @@ namespace mg5amcCpu fptype* ghelAllNumerators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // input/tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + bool storeChannelWeights, // if true, compute final multichannel weights + bool mulChannelWeight, // if true, multiply matrix element by channel weight #endif - const fptype globaldenom ) /* clang-format on */ + const fptype globaldenom) /* clang-format on */ { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) allMEs[ievt] /= globaldenom; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const int nevt = gridDim.x * blockDim.x; - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( storeChannelWeights ) // fix segfault #892 (not 'channelIds[0] != 0') { fptype* totAllNumerators = ghelAllNumerators; // reuse "helicity #0" buffer to compute the total over all helicities fptype* totAllDenominators = ghelAllDenominators; // reuse "helicity #0" buffer to compute the total over all helicities for( int ighel = 1; ighel < dcNGoodHel; ighel++ ) // NB: the loop starts at ighel=1 { - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; - totAllNumerators[ievt] += hAllNumerators[ievt]; totAllDenominators[ievt] += hAllDenominators[ievt]; + fptype* hAllNumerators = ghelAllNumerators + ( ievt + ighel * nevt ) * processConfig::ndiagrams; + fptype* firstNumerator = ghelAllNumerators + ievt * processConfig::ndiagrams; + for( int idiag = 0; idiag < processConfig::ndiagrams; ++idiag ) + { + firstNumerator[idiag] += hAllNumerators[idiag]; + } + } + if( mulChannelWeight ) + { + unsigned int channelId = allChannelIds[ievt]; + allMEs[ievt] *= totAllNumerators[channelId - 1 + ievt * processConfig::ndiagrams] / totAllDenominators[ievt]; } - allMEs[ievt] *= totAllNumerators[ievt] / totAllDenominators[ievt]; } #endif return; @@ -933,16 +1002,44 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL __global__ void - select_col( int* allselcol, // output: color selection[nevt] - const fptype* allrndcol, // input: random numbers[nevt] for color selection - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) - const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) - const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + select_col_and_diag( int* allselcol, // output: color selection[nevt] + unsigned int* allDiagramIdsOut, // output: sampled diagram ids + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const fptype* allNumerators, // input: all numerators + const fptype* allDenominators, // input: all denominators + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) { const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) // SCALAR channelId for the current event (CUDA) unsigned int channelId = gpu_channelId( allChannelIds ); // Event-by-event random choice of color #402 + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt * processConfig::ndiagrams + ichan]; + } + channelId = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt * processConfig::ndiagrams + ichan]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) + { + channelId = ichan + 1; + break; + } + } + allDiagramIdsOut[ievt] = channelId; + } + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { if( channelId > mgOnGpu::nchannels ) @@ -1010,6 +1107,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for diagram sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -1019,6 +1117,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual 
helicities (index is ighel) fptype* ghelAllJamps, // tmp: jamp[2*ncolor*nGoodHel*nevt] super-buffer for nGoodHel <= ncomb individual helicities @@ -1030,8 +1130,10 @@ namespace mg5amcCpu #else #ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] - fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif @@ -1059,7 +1161,6 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events - using CID_ACCESS = HostAccessChannelIds; // non-trivial access: buffer includes all events #endif #endif @@ -1075,7 +1176,7 @@ namespace mg5amcCpu gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); - gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * processConfig::ndiagrams * nevt * sizeof( fptype ) ); gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); @@ -1089,11 +1190,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv = fptype_sv{ 0 }; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - numerators_sv = fptype_sv{ 0 }; + for( int i = 0; i < processConfig::ndiagrams; ++i ) + { + numerators_sv[i] = fptype_sv{ 0 }; + } denominators_sv = fptype_sv{ 0 }; #endif } @@ -1105,6 +1209,7 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) // (1) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s @@ -1113,9 +1218,10 @@ namespace mg5amcCpu const int ihel = cGoodHel[ighel]; fptype* hAllJamps = ghelAllJamps + ighel * nevt; // HACK: bypass DeviceAccessJamp (consistent with layout defined there) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt * processConfig::ndiagrams; fptype* hAllDenominators = 
ghelAllDenominators + ighel * nevt; - gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, allChannelIds, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, storeChannelWeights, hAllNumerators, hAllDenominators, colAllJamp2s, nevt ); #else gpuLaunchKernelStream( calculate_jamps, gpublocks, gputhreads, ghelStreams[ighel], ihel, allmomenta, allcouplings, hAllJamps, nevt ); #endif @@ -1126,9 +1232,15 @@ namespace mg5amcCpu // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, storeChannelWeights, mulChannelWeight, helcolDenominators[0] ); + + // Event-by-event random choice of color and diagram #402 + gpuLaunchKernel( select_col_and_diag, gpublocks, gputhreads, allselcol, allDiagramIdsOut, allrndcol, allrnddiagram, allChannelIds, colAllJamp2s, ghelAllNumerators, ghelAllDenominators, gpublocks * gputhreads ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1166,40 +1278,6 @@ namespace mg5amcCpu const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time #else const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time -#endif -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - // First - and/or only - neppV page of channels (iParity=0 => ievt0 = ievt00 + 0 * neppV) - const unsigned int* channelIds = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 ); // fix bug #899/#911 - uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) -#ifndef MGONGPU_CPPSIMD - // NB: channelIds_sv is a scalar in no-SIMD C++ - channelId = channelIds_sv; -#else - // NB: channelIds_sv is a vector in SIMD C++ - channelId = channelIds_sv[0]; // element[0] - for( int i = 1; i < neppV; ++i ) // elements[1...neppV-1] - { - assert( channelId == channelIds_sv[i] ); // SANITY CHECK #898: check that all events in a SIMD vector have the same channelId - } -#endif - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) - const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 - uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - for( int i = 0; i < neppV; ++i ) - { - assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector - } -#endif - } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) @@ -1214,7 +1292,8 @@ namespace mg5amcCpu cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 - calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + bool storeChannelWeights = allChannelIds != nullptr || allrnddiagram != nullptr; + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, storeChannelWeights, allNumerators, allDenominators, jamp2_sv, ievt00 ); #else calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif @@ -1262,82 +1341,97 @@ namespace mg5amcCpu } #endif } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype_sv targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + const int vecsize = 2 * neppV; +#else + const int vecsize = neppV; +#endif + unsigned int channelIdVec[vecsize]; + if( allChannelIds != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + const int ievt = ievt00 + ieppV; + channelIdVec[ieppV] = allChannelIds[ievt]; } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) + } + + // Event-by-event random choice of channel + if( allrnddiagram != nullptr ) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) { const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + fptype numerator_sum = 0., normalization = 0.; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) { -#if defined MGONGPU_CPPSIMD - // Add volatile here to avoid SIGFPE crashes in FPTYPE=f cpp512z builds (#845) - volatile const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); -#else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); -#endif - if( okcol ) + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + normalization += allNumerators[ievt / neppV * neppV * processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + } + channelIdVec[ieppV] = mgOnGpu::nchannels; + for( unsigned int ichan = 0; ichan < mgOnGpu::nchannels; ichan++ ) + { + if( mgOnGpu::channel2iconfig[ichan] == -1 ) continue; + numerator_sum += allNumerators[ievt / neppV * neppV * 
processConfig::ndiagrams + + ichan * neppV + ieppV % neppV]; + if( allrnddiagram[ievt] < numerator_sum / normalization ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); + channelIdVec[ieppV] = ichan + 1; break; } } -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + allDiagramIdsOut[ievt] = channelIdVec[ieppV]; + } + } + + // Event-by-event random choice of color #402 + if( allChannelIds != nullptr || allrnddiagram != nullptr ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + for( int ieppV = 0; ieppV < vecsize; ++ieppV ) + { + unsigned int channelId = channelIdVec[ieppV]; + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] for( int icolC = 0; icolC < ncolor; icolC++ ) { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += + jamp2_sv[icolC + ncolor * ( ieppV / neppV )][ieppV % neppV]; + } + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt2=%d icol=%d\n", ievt2, icolC+1 ); + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); break; } } -#endif } } else @@ -1362,13 +1456,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); -#else - gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL for( int ipagV = 0; ipagV < npagV; ++ipagV ) { const int ievt0 = ipagV * neppV; @@ -1376,13 +1464,14 @@ namespace mg5amcCpu fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') + if( mulChannelWeight && allChannelIds != nullptr ) // fix segfault #892 (not 'channelIds[0] != 0') { - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + const unsigned int channelId = getChannelId( allChannelIds, ievt0, false ); + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 * processConfig::ndiagrams ); fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv* numerators_sv = NUM_ACCESS::kernelAccessP( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; + MEs_sv *= numerators_sv[channelId - 1] / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 256c5780e4..99f978df4c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -163,6 +163,7 @@ namespace mg5amcCpu #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -171,6 +172,8 @@ namespace mg5amcCpu fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) fptype* ghelAllJamps, // tmp: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities @@ -187,6 +190,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) + const fptype* allrnddiagram, // input: random numbers[nevt] for channel sampling #endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] @@ -194,6 +198,8 @@ namespace mg5amcCpu int* allselcol, // output: helicity selection[nevt] fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities + unsigned int* allDiagramIdsOut, // output: multichannel channelIds[nevt] (1 to #diagrams) + bool mulChannelWeight, // if true, multiply channel weight to ME output #endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h new file mode 100644 index 0000000000..04a79dca0d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/processConfig.h @@ -0,0 +1,16 @@ +// Copyright (C) 2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (May 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: ... for the MG5aMC CUDACPP plugin. 
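// Note: processConfig::ndiagrams mirrors CPPProcess::ndiagrams as a standalone
// header-only constant; the per-diagram numerator buffers and the diagram-sampling
// loops in CPPProcess.cc assume that mgOnGpu::nchannels does not exceed it.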
+ + +#ifndef MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H +#define MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H 1 + +namespace processConfig { + + constexpr int ndiagrams = 3; + +} + +#endif // MG5_CONFIG_SIGMA_MSSM_SLHA2_GG_TTX_H \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc new file mode 120000 index 0000000000..e603bf8fdb --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.cc @@ -0,0 +1 @@ +../umami.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h new file mode 120000 index 0000000000..1267019e18 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/umami.h @@ -0,0 +1 @@ +../umami.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index e7360b29e2..e093865b60 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -834,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(BUILDDIR)/umami_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o $(BUILDDIR)/umami_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc new file mode 100644 index 0000000000..2b52267519 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.cc @@ -0,0 +1,530 @@ +#include "umami.h" + +#include "CPPProcess.h" +#include "GpuRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +namespace +{ + + void* initialize_impl( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + bool is_good_hel[CPPProcess::ncomb]; + sigmaKin_getGoodHel( + momenta, couplings, matrix_elements, numerators, denominators, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, 
+#endif + is_good_hel, + count ); + sigmaKin_setGoodHel( is_good_hel ); + return nullptr; + } + + void initialize( + const fptype* momenta, + const fptype* couplings, + fptype* matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + fptype* color_jamps, +#endif + fptype* numerators, + fptype* denominators, + std::size_t count ) + { + // static local initialization is called exactly once in a thread-safe way + static void* dummy = initialize_impl( momenta, couplings, matrix_elements, +#ifdef MGONGPUCPP_GPUIMPL + color_jamps, +#endif + numerators, + denominators, + count ); + } + +#ifdef MGONGPUCPP_GPUIMPL + __device__ +#endif + void + transpose_momenta( const double* momenta_in, fptype* momenta_out, std::size_t i_event, std::size_t stride ) + { + std::size_t page_size = MemoryAccessMomentaBase::neppM; + std::size_t i_page = i_event / page_size; + std::size_t i_vector = i_event % page_size; + + for( std::size_t i_part = 0; i_part < CPPProcess::npar; ++i_part ) + { + for( std::size_t i_mom = 0; i_mom < 4; ++i_mom ) + { + momenta_out[i_page * CPPProcess::npar * 4 * page_size + + i_part * 4 * page_size + i_mom * page_size + i_vector] = momenta_in[stride * ( CPPProcess::npar * i_mom + i_part ) + i_event]; + } + } + } + +#ifdef MGONGPUCPP_GPUIMPL + + __global__ void copy_inputs( + const double* momenta_in, + const double* helicity_random_in, + const double* color_random_in, + const double* diagram_random_in, + const double* alpha_s_in, + fptype* momenta, + fptype* helicity_random, + fptype* color_random, + fptype* diagram_random, + fptype* g_s, + std::size_t count, + std::size_t stride, + std::size_t offset ) + { + std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x; + if( i_event >= count ) return; + + transpose_momenta( &momenta_in[offset], momenta, i_event, stride ); + diagram_random[i_event] = diagram_random_in ? diagram_random_in[i_event + offset] : 0.5; + helicity_random[i_event] = helicity_random_in ? helicity_random_in[i_event + offset] : 0.5; + color_random[i_event] = color_random_in ? color_random_in[i_event + offset] : 0.5; + g_s[i_event] = alpha_s_in ? 
sqrt( 4 * M_PI * alpha_s_in[i_event + offset] ) : 1.2177157847767195;
+  }
+
+  __global__ void copy_outputs(
+    fptype* denominators,
+    fptype* numerators,
+    fptype* matrix_elements,
+    unsigned int* diagram_index,
+    int* color_index,
+    int* helicity_index,
+    double* m2_out,
+    double* amp2_out,
+    int* diagram_out,
+    int* color_out,
+    int* helicity_out,
+    std::size_t count,
+    std::size_t stride,
+    std::size_t offset )
+  {
+    std::size_t i_event = blockDim.x * blockIdx.x + threadIdx.x;
+    if( i_event >= count ) return;
+
+    if( m2_out ) m2_out[i_event + offset] = matrix_elements[i_event];
+    if( amp2_out )
+    {
+      double denominator = denominators[i_event];
+      for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+      {
+        amp2_out[stride * i_diag + i_event + offset] = numerators[i_event * CPPProcess::ndiagrams + i_diag] / denominator;
+      }
+    }
+    if( diagram_out ) diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+    if( color_out ) color_out[i_event + offset] = color_index[i_event] - 1;
+    if( helicity_out ) helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+  }
+
+#endif // MGONGPUCPP_GPUIMPL
+
+  struct InterfaceInstance
+  {
+    bool initialized = false;
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t hel_streams[CPPProcess::ncomb];
+#endif
+  };
+
+}
+
+extern "C"
+{
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result )
+  {
+    switch( meta_key )
+    {
+      case UMAMI_META_DEVICE:
+      {
+        UmamiDevice& device = *static_cast<UmamiDevice*>( result );
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
+        device = UMAMI_DEVICE_CUDA;
+#elif defined( __HIPCC__ )
+        device = UMAMI_DEVICE_HIP;
+#endif
+#else
+        device = UMAMI_DEVICE_CPU;
+#endif
+        break;
+      }
+      case UMAMI_META_PARTICLE_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::npar;
+        break;
+      case UMAMI_META_DIAGRAM_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ndiagrams;
+        break;
+      case UMAMI_META_HELICITY_COUNT:
+        *static_cast<int*>( result ) = CPPProcess::ncomb;
+        break;
+      case UMAMI_META_COLOR_COUNT:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+      default:
+        return UMAMI_ERROR_UNSUPPORTED_META;
+    }
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path )
+  {
+    CPPProcess process;
+    process.initProc( param_card_path );
+    auto instance = new InterfaceInstance();
+    *handle = instance;
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      gpuStreamCreate( &instance->hel_streams[ihel] );
+    }
+#endif
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
+
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag )
+  {
+    return UMAMI_ERROR_NOT_IMPLEMENTED;
+  }
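// NB: umami_get_meta above writes through the untyped result pointer: UMAMI_META_DEVICE
// expects it to point to an UmamiDevice, while the particle/diagram/helicity counts
// expect an int; passing a pointer to the wrong type is undefined behaviour.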
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs )
+  {
+    const double* momenta_in = nullptr;
+    const double* alpha_s_in = nullptr;
+    const int* flavor_in = nullptr; // TODO: unused
+    const double* random_color_in = nullptr;
+    const double* random_helicity_in = nullptr;
+    const double* random_diagram_in = nullptr;
+    const int* diagram_in = nullptr; // TODO: unused
+
+    for( std::size_t i = 0; i < input_count; ++i )
+    {
+      const void* input = inputs[i];
+      switch( input_keys[i] )
+      {
+        case UMAMI_IN_MOMENTA:
+          momenta_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_ALPHA_S:
+          alpha_s_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_FLAVOR_INDEX:
+          flavor_in = static_cast<const int*>( input );
+          break;
+        case UMAMI_IN_RANDOM_COLOR:
+          random_color_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_HELICITY:
+          random_helicity_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_RANDOM_DIAGRAM:
+          random_diagram_in = static_cast<const double*>( input );
+          break;
+        case UMAMI_IN_HELICITY_INDEX:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+        case UMAMI_IN_DIAGRAM_INDEX:
+          diagram_in = static_cast<const int*>( input );
+          break;
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_INPUT;
+      }
+    }
+    if( !momenta_in ) return UMAMI_ERROR_MISSING_INPUT;
+
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuStream_t gpu_stream = nullptr;
+#endif
+    double* m2_out = nullptr;
+    double* amp2_out = nullptr;
+    int* diagram_out = nullptr;
+    int* color_out = nullptr;
+    int* helicity_out = nullptr;
+    for( std::size_t i = 0; i < output_count; ++i )
+    {
+      void* output = outputs[i];
+      switch( output_keys[i] )
+      {
+        case UMAMI_OUT_MATRIX_ELEMENT:
+          m2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_AMP2:
+          amp2_out = static_cast<double*>( output );
+          break;
+        case UMAMI_OUT_COLOR_INDEX:
+          color_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_HELICITY_INDEX:
+          helicity_out = static_cast<int*>( output );
+          break;
+        case UMAMI_OUT_DIAGRAM_INDEX:
+          diagram_out = static_cast<int*>( output );
+          break;
+#ifdef MGONGPUCPP_GPUIMPL
+        case UMAMI_OUT_GPU_STREAM:
+          gpu_stream = static_cast<gpuStream_t>( output );
+          break;
+#endif
+        default:
+          return UMAMI_ERROR_UNSUPPORTED_OUTPUT;
+      }
+    }
+
+#ifdef MGONGPUCPP_GPUIMPL
+    std::size_t n_threads = 256;
+    std::size_t n_blocks = ( count + n_threads - 1 ) / n_threads;
+    std::size_t rounded_count = n_blocks * n_threads;
+
+    fptype *momenta, *couplings, *g_s, *helicity_random, *color_random, *diagram_random, *color_jamps;
+    fptype *matrix_elements, *numerators, *denominators, *ghel_matrix_elements, *ghel_jamps;
+    int *helicity_index, *color_index;
+    unsigned int* diagram_index;
+
+    std::size_t n_coup = mg5amcGpu::Parameters_MSSM_SLHA2_dependentCouplings::ndcoup;
+    gpuMallocAsync( &momenta, rounded_count * CPPProcess::npar * 4 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &couplings, rounded_count * n_coup * 2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &g_s, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &color_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_random, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &matrix_elements, rounded_count * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &diagram_index, rounded_count * sizeof( unsigned int ), gpu_stream );
+    gpuMallocAsync( &color_jamps, rounded_count * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &numerators, rounded_count * CPPProcess::ndiagrams * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &denominators, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &helicity_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &color_index, rounded_count * sizeof( int ), gpu_stream );
+    gpuMallocAsync( &ghel_matrix_elements, rounded_count * CPPProcess::ncomb * sizeof( fptype ), gpu_stream );
+    gpuMallocAsync( &ghel_jamps, rounded_count * CPPProcess::ncomb * CPPProcess::ncolor * mgOnGpu::nx2 * sizeof( fptype ), gpu_stream );
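// NB: the scratch buffers above use stream-ordered allocation (gpuMallocAsync on the
// caller-provided gpu_stream), so they are only valid for work enqueued on that stream;
// the matching gpuFreeAsync calls below return them to the pool without forcing a
// device-wide synchronization.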
+
+    copy_inputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      momenta_in,
+      random_helicity_in,
+      random_color_in,
+      random_diagram_in,
+      alpha_s_in,
+      momenta,
+      helicity_random,
+      color_random,
+      diagram_random,
+      g_s,
+      count,
+      stride,
+      offset );
+    computeDependentCouplings<<<n_blocks, n_threads, 0, gpu_stream>>>( g_s, couplings );
+    checkGpu( gpuPeekAtLastError() );
+    // TODO: make things fully async (requires using events instead of synchronize in
+    // the sigmaKin implementation)
+    gpuStreamSynchronize( gpu_stream );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta, couplings, matrix_elements, color_jamps, numerators, denominators, rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta,
+      couplings,
+      helicity_random,
+      color_random,
+      nullptr,
+      diagram_random,
+      matrix_elements,
+      helicity_index,
+      color_index,
+      color_jamps,
+      numerators,
+      denominators,
+      diagram_index,
+      false,
+      ghel_matrix_elements,
+      ghel_jamps,
+      nullptr,
+      nullptr,
+      instance->hel_streams,
+      n_blocks,
+      n_threads );
+
+    copy_outputs<<<n_blocks, n_threads, 0, gpu_stream>>>(
+      denominators,
+      numerators,
+      matrix_elements,
+      diagram_index,
+      color_index,
+      helicity_index,
+      m2_out,
+      amp2_out,
+      diagram_out,
+      color_out,
+      helicity_out,
+      count,
+      stride,
+      offset );
+    checkGpu( gpuPeekAtLastError() );
+
+    gpuFreeAsync( momenta, gpu_stream );
+    gpuFreeAsync( couplings, gpu_stream );
+    gpuFreeAsync( g_s, gpu_stream );
+    gpuFreeAsync( helicity_random, gpu_stream );
+    gpuFreeAsync( color_random, gpu_stream );
+    gpuFreeAsync( diagram_random, gpu_stream );
+    gpuFreeAsync( matrix_elements, gpu_stream );
+    gpuFreeAsync( diagram_index, gpu_stream );
+    gpuFreeAsync( color_jamps, gpu_stream );
+    gpuFreeAsync( numerators, gpu_stream );
+    gpuFreeAsync( denominators, gpu_stream );
+    gpuFreeAsync( helicity_index, gpu_stream );
+    gpuFreeAsync( color_index, gpu_stream );
+    gpuFreeAsync( ghel_matrix_elements, gpu_stream );
+    gpuFreeAsync( ghel_jamps, gpu_stream );
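// NB: copy_outputs above converts the internal conventions to the UMAMI ones: amp2 is
// the per-diagram weight numerators[idiag] / denominator for each event, and the
// returned diagram, color and helicity indices are shifted from 1-based to 0-based.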
+    computeDependentCouplings( g_s.data(), couplings.data(), rounded_count );
+
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+    if( !instance->initialized )
+    {
+      initialize(
+        momenta.data(),
+        couplings.data(),
+        matrix_elements.data(),
+        numerators.data(),
+        denominators.data(),
+        rounded_count );
+      instance->initialized = true;
+    }
+
+    sigmaKin(
+      momenta.data(),
+      couplings.data(),
+      helicity_random.data(),
+      color_random.data(),
+      nullptr,
+      diagram_random.data(),
+      matrix_elements.data(),
+      helicity_index.data(),
+      color_index.data(),
+      numerators.data(),
+      denominators.data(),
+      diagram_index.data(),
+      false,
+      rounded_count );
+
+    std::size_t page_size = MemoryAccessMomentaBase::neppM;
+    for( std::size_t i_event = 0; i_event < count; ++i_event )
+    {
+      std::size_t i_page = i_event / page_size;
+      std::size_t i_vector = i_event % page_size;
+
+      double denominator = denominators[i_event];
+      if( m2_out != nullptr )
+      {
+        m2_out[i_event + offset] = matrix_elements[i_event];
+      }
+      if( amp2_out != nullptr )
+      {
+        for( std::size_t i_diag = 0; i_diag < CPPProcess::ndiagrams; ++i_diag )
+        {
+          amp2_out[stride * i_diag + i_event + offset] = numerators[i_page * page_size * CPPProcess::ndiagrams + i_diag * page_size + i_vector] / denominator;
+        }
+      }
+      if( diagram_out != nullptr )
+      {
+        diagram_out[i_event + offset] = diagram_index[i_event] - 1;
+      }
+      if( color_out != nullptr )
+      {
+        color_out[i_event + offset] = color_index[i_event] - 1;
+      }
+      if( helicity_out != nullptr )
+      {
+        helicity_out[i_event + offset] = helicity_index[i_event] - 1;
+      }
+    }
+#endif // MGONGPUCPP_GPUIMPL
+    return UMAMI_SUCCESS;
+  }
+
+  UmamiStatus umami_free( UmamiHandle handle )
+  {
+    InterfaceInstance* instance = static_cast<InterfaceInstance*>( handle );
+#ifdef MGONGPUCPP_GPUIMPL
+    for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ )
+    {
+      if( instance->hel_streams[ihel] ) gpuStreamDestroy( instance->hel_streams[ihel] );
+    }
+#endif
+    delete instance;
+    return UMAMI_SUCCESS;
+  }
+}
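
[Reviewer note, not part of the patch: the amp2 copy loop above reads the per-diagram numerators back from the paged "AOSOA" layout (events grouped in pages of MemoryAccessMomentaBase::neppM events, with the diagram index as the middle dimension). A minimal standalone sketch of that indexing, with illustrative names and sizes:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Flat index of (i_event, i_diag) in a [n_pages][n_diagrams][page_size] layout,
    // mirroring the expression used in the copy loop above.
    std::size_t aosoa_index( std::size_t i_event, std::size_t i_diag,
                             std::size_t page_size, std::size_t n_diagrams )
    {
      const std::size_t i_page = i_event / page_size;
      const std::size_t i_vector = i_event % page_size;
      return i_page * page_size * n_diagrams + i_diag * page_size + i_vector;
    }

    int main()
    {
      const std::size_t page_size = 4, n_diagrams = 3, n_events = 8;
      std::vector<double> numerators( n_events * n_diagrams );
      // Fill in AOSOA order, then read back one (event, diagram) pair.
      for( std::size_t e = 0; e < n_events; ++e )
        for( std::size_t d = 0; d < n_diagrams; ++d )
          numerators[aosoa_index( e, d, page_size, n_diagrams )] = 100. * e + d;
      assert( numerators[aosoa_index( 5, 2, page_size, n_diagrams )] == 502. );
      return 0;
    }

Within a page, consecutive events of the same diagram are contiguous, which is what makes the SIMD/coalesced access in sigmaKin possible; the loop above undoes this transposition into the caller's plain strided layout.]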
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h
new file mode 100644
index 0000000000..39ac6fe385
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/umami.h
@@ -0,0 +1,212 @@
+/*
+ *                                  _
+ *                                 (_)
+ *  _   _ _ __ ___   __ _ _ __ ___  _
+ * | | | | '_ ` _ \ / _` | '_ ` _ \| |
+ * | |_| | | | | | | (_| | | | | | | |
+ *  \__,_|_| |_| |_|\__,_|_| |_| |_|_|
+ *
+ * Unified MAtrix eleMent Interface
+ *
+ *
+ */
+
+#ifndef UMAMI_HEADER
+#define UMAMI_HEADER 1
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * Major version number of the UMAMI interface. If the major version is the same
+   * between caller and implementation, binary compatibility is ensured.
+   */
+  const inline int UMAMI_MAJOR_VERSION = 1;
+
+  /**
+   * Minor version number of the UMAMI interface. Between minor versions, new keys for
+   * errors, devices, metadata, inputs and outputs can be added.
+   */
+  const inline int UMAMI_MINOR_VERSION = 0;
+
+  typedef enum
+  {
+    UMAMI_SUCCESS,
+    UMAMI_ERROR,
+    UMAMI_ERROR_NOT_IMPLEMENTED,
+    UMAMI_ERROR_UNSUPPORTED_INPUT,
+    UMAMI_ERROR_UNSUPPORTED_OUTPUT,
+    UMAMI_ERROR_UNSUPPORTED_META,
+    UMAMI_ERROR_MISSING_INPUT,
+  } UmamiStatus;
+
+  typedef enum
+  {
+    UMAMI_DEVICE_CPU,
+    UMAMI_DEVICE_CUDA,
+    UMAMI_DEVICE_HIP,
+  } UmamiDevice;
+
+  typedef enum
+  {
+    UMAMI_META_DEVICE,
+    UMAMI_META_PARTICLE_COUNT,
+    UMAMI_META_DIAGRAM_COUNT,
+    UMAMI_META_HELICITY_COUNT,
+    UMAMI_META_COLOR_COUNT,
+  } UmamiMetaKey;
+
+  typedef enum
+  {
+    UMAMI_IN_MOMENTA,
+    UMAMI_IN_ALPHA_S,
+    UMAMI_IN_FLAVOR_INDEX,
+    UMAMI_IN_RANDOM_COLOR,
+    UMAMI_IN_RANDOM_HELICITY,
+    UMAMI_IN_RANDOM_DIAGRAM,
+    UMAMI_IN_HELICITY_INDEX,
+    UMAMI_IN_DIAGRAM_INDEX,
+  } UmamiInputKey;
+
+  typedef enum
+  {
+    UMAMI_OUT_MATRIX_ELEMENT,
+    UMAMI_OUT_DIAGRAM_AMP2,
+    UMAMI_OUT_COLOR_INDEX,
+    UMAMI_OUT_HELICITY_INDEX,
+    UMAMI_OUT_DIAGRAM_INDEX,
+    UMAMI_OUT_GPU_STREAM,
+    // NLO: born, virtual, poles, counterterms
+    // color: LC-ME, FC-ME
+  } UmamiOutputKey;
+
+  typedef void* UmamiHandle;
+
+  /**
+   * Retrieves a global metadata item of the matrix element code, e.g. the device
+   * it runs on or the number of particles, diagrams, helicity or color
+   * configurations.
+   *
+   * @param meta_key
+   *    key of the metadata item to retrieve
+   * @param result
+   *    pointer to the memory where the result is written; its type depends on
+   *    the metadata key
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_meta( UmamiMetaKey meta_key, void* result );
+
+  /**
+   * Creates an instance of the matrix element. Each instance is independent, so thread
+   * safety can be achieved by creating a separate one for every thread.
+   *
+   * @param handle
+   *    pointer to an instance of the subprocess. Has to be cleaned up by
+   *    the caller with `umami_free`.
+   * @param param_card_path
+   *    path to the parameter file
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_initialize( UmamiHandle* handle, char const* param_card_path );
+
+  /**
+   * Sets the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    real part of the parameter value
+   * @param parameter_imag
+   *    imaginary part of the parameter value. Ignored for real-valued parameters.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_set_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double parameter_real,
+    double parameter_imag );
+
+  /**
+   * Retrieves the value of a model parameter.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param name
+   *    name of the parameter
+   * @param parameter_real
+   *    pointer to double to return the real part of the parameter value
+   * @param parameter_imag
+   *    pointer to double to return the imaginary part of the parameter value. Ignored
+   *    for real-valued parameters (i.e. you may pass a null pointer)
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_get_parameter(
+    UmamiHandle handle,
+    char const* name,
+    double* parameter_real,
+    double* parameter_imag );
+
+  /**
+   * Evaluates the matrix element as a function of the given inputs, filling the
+   * requested outputs.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @param count
+   *    number of events to evaluate the matrix element for
+   * @param stride
+   *    stride of the batch dimension of the input and output arrays, see memory layout
+   * @param offset
+   *    offset of the event index
+   * @param input_count
+   *    number of inputs to the matrix element
+   * @param input_keys
+   *    pointer to an array of input keys, length `input_count`
+   * @param inputs
+   *    pointer to an array of void pointers to the inputs. The type of the inputs
+   *    depends on the input key
+   * @param output_count
+   *    number of outputs of the matrix element
+   * @param output_keys
+   *    pointer to an array of output keys, length `output_count`
+   * @param outputs
+   *    pointer to an array of void pointers to the outputs. The type of the outputs
+   *    depends on the output key. The caller is responsible for allocating memory for
+   *    the outputs.
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_matrix_element(
+    UmamiHandle handle,
+    size_t count,
+    size_t stride,
+    size_t offset,
+    size_t input_count,
+    UmamiInputKey const* input_keys,
+    void const* const* inputs,
+    size_t output_count,
+    UmamiOutputKey const* output_keys,
+    void* const* outputs );
+
+  /**
+   * Frees a matrix element instance.
+   *
+   * @param handle
+   *    handle of a matrix element instance
+   * @return
+   *    UMAMI_SUCCESS on success, error code otherwise
+   */
+  UmamiStatus umami_free( UmamiHandle handle );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UMAMI_HEADER
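
[Reviewer note, not part of the patch: a hypothetical end-to-end caller sketch for the API declared above. The function signatures and key enums come from umami.h itself; the param card path, the event counts, and the momenta layout (component-major with the event index running fastest, stride = batch size) are assumptions inferred from the output indexing in the implementation:

    #include "umami.h"
    #include <cstdio>
    #include <vector>

    int main()
    {
      UmamiHandle handle = nullptr;
      // Path to the parameter card is illustrative.
      if( umami_initialize( &handle, "../Cards/param_card.dat" ) != UMAMI_SUCCESS ) return 1;

      const size_t count = 16, stride = 16, offset = 0, npar = 4;
      std::vector<double> momenta( npar * 4 * stride ); // assumed [par][E,px,py,pz][event]
      std::vector<double> alpha_s( stride, 0.118 );
      // ... fill momenta with phase-space points here ...

      UmamiInputKey in_keys[] = { UMAMI_IN_MOMENTA, UMAMI_IN_ALPHA_S };
      const void* ins[] = { momenta.data(), alpha_s.data() };

      std::vector<double> m2( stride ); // caller allocates all outputs
      UmamiOutputKey out_keys[] = { UMAMI_OUT_MATRIX_ELEMENT };
      void* outs[] = { m2.data() };

      const UmamiStatus st = umami_matrix_element( handle, count, stride, offset,
                                                   2, in_keys, ins, 1, out_keys, outs );
      if( st == UMAMI_SUCCESS )
        for( size_t i = 0; i < count; ++i ) std::printf( "me[%zu] = %g\n", i, m2[i] );
      umami_free( handle );
      return st == UMAMI_SUCCESS ? 0 : 1;
    }

Per the implementation above, omitted random-number inputs fall back to 0.5 and a missing UMAMI_IN_ALPHA_S falls back to a fixed g_s, so the minimal required input is UMAMI_IN_MOMENTA; all other inputs and outputs are optional.]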